# Importing Packages

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing

%matplotlib inline

# Reading CSV file as weather_df and making date_time column as index of dataframe

In [None]:
# Load 'delhi.csv' into the DataFrame 'weather_df'. The 'date_time' column is
# parsed as datetimes and used as the row index.
weather_df = pd.read_csv(
    'delhi.csv',
    index_col='date_time',
    parse_dates=['date_time'],
)

# Preview the first 5 rows to check that the file was loaded as expected.
weather_df.head(n=5)

# Checking columns in our dataframe

In [None]:
# Display the column labels of the 'weather_df' DataFrame.
weather_df.columns

## Now shape

In [None]:
# Display the dimensions of 'weather_df' as a two-element tuple:
# (number of rows, number of columns).
weather_df.shape

In [None]:
# Display descriptive statistics for 'weather_df' via DataFrame.describe().
weather_df.describe()

# This code generates descriptive statistics of the pandas DataFrame weather_df. The describe() method calculates various summary statistics, including count, mean, standard deviation, minimum, maximum, and quartile values for each numerical column in the DataFrame. This is useful for quickly getting a sense of the range and distribution of values in the DataFrame.

# Checking whether there are any null values in the dataset

In [None]:
# For each column, report whether it contains at least one missing value.
weather_df.isna().any()

# Data Visualization

In [None]:
# Correlation heatmap of the numeric columns of 'weather_df'.
# Non-numeric columns are excluded explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops them silently and raises instead.
fig, axes = plt.subplots(figsize=(8, 6))
sns.heatmap(weather_df.select_dtypes(include='number').corr(), ax=axes)

### Now let's separate the feature to be predicted (i.e. temperature) from the rest of the features. weather_x stores the rest of the dataset, while weather_y holds the temperature column.

In [None]:
# Build 'weather_df_num': all rows of 'weather_df', restricted to the ten
# columns listed below (the target 'tempC' is still included at this point).
weather_df_num = weather_df.loc[
    :,
    [
        'maxtempC',
        'mintempC',
        'cloudcover',
        'humidity',
        'tempC',
        'sunHour',
        'HeatIndexC',
        'precipMM',
        'pressure',
        'windspeedKmph',
    ],
]

# Preview the first 5 rows to confirm that only the selected columns remain.
weather_df_num.head(5)

# Shape of new dataframe

In [None]:
# Display the (rows, columns) dimensions of the reduced DataFrame.
weather_df_num.shape

# Columns in new dataframe

In [None]:
# Display the column labels kept in the reduced DataFrame.
weather_df_num.columns

## Plotting all the column values

In [None]:
# Draw one subplot per column of 'weather_df_num' on a 25 x 20 inch figure.
weather_df_num.plot(figsize=(25, 20), subplots=True)

# Figure-level title, x-axis label on the current axes, and subplot spacing.
plt.suptitle('Weather Data', fontsize=20)
plt.xlabel('Date')
plt.subplots_adjust(wspace=0.3, hspace=0.5)

# Use each column name as both the title and the y-axis label of its subplot.
for subplot_ax, column_name in zip(plt.gcf().axes, weather_df_num.columns):
    subplot_ax.set(title=column_name, ylabel=column_name)

# Render the figure.
plt.show()


# Plotting all the column values for 2019-2020

In [None]:
# Plot every column of 'weather_df_num' for 2019-2020, one subplot per column,
# on a 25 x 20 inch figure.
# The rows are selected by slicing the datetime index with '2019':'2020', then
# resampled to daily ('D') frequency and forward-filled.
# Resampler.ffill() is used because Resampler.fillna(method='pad') is
# deprecated in recent pandas releases; both perform the same forward fill.
weather_df_num['2019':'2020'].resample('D').ffill().plot(subplots=True, figsize=(25,20))

# set the title and axis labels for each subplot
plt.suptitle('Weather Data from 2019 to 2020', fontsize=20)
plt.xlabel('Date')
plt.subplots_adjust(hspace=0.5, wspace=0.3)
for ax, col in zip(plt.gcf().axes, weather_df_num.columns):
    ax.set_title(col)
    ax.set_ylabel(col)

# show the plot
plt.show()


In [None]:
# Draw a 10-bin frequency histogram for each column of 'weather_df_num'
# on a 15 x 15 inch figure.
weather_df_num.hist(figsize=(15, 15), bins=10)

# Figure-level title.
plt.suptitle('Frequency Histogram', fontsize=20)

In [None]:
# Keep only the rows whose datetime index falls in 2019-2020 and store them
# as 'weth'; this subset is used by the scatter plots further down.
weth = weather_df_num.loc['2019':'2020']

# Preview the first 5 rows of the subset.
weth.head(5)

In [None]:
# Separate the target from the features:
#   weather_y - the 'tempC' column (the value to predict)
#   weather_x - every other column of 'weather_df_num'
# A non-mutating select/drop is used instead of DataFrame.pop(), so that
# 'weather_df_num' keeps its 'tempC' column and this cell (and the plotting
# cells that read 'weather_df_num') can be re-run without a KeyError.
weather_y = weather_df_num["tempC"]
weather_x = weather_df_num.drop(columns="tempC")

# The resulting weather_x dataframe can be used as input to a machine learning algorithm, while the weather_y series can be used as the target variable to train the algorithm to predict temperature.

### Now our dataset is prepared and it is ready to be fed to the model for training, it’s time to split the dataset into training and testing.

In [None]:
# Split features and target into training and testing sets with
# scikit-learn's train_test_split: 20% of the rows are held out for testing,
# and a fixed random_state makes the split reproducible between runs.
train_X, test_X, train_y, test_y = train_test_split(
    weather_x,
    weather_y,
    random_state=4,
    test_size=0.2,
)

In [None]:
# Display the (rows, columns) dimensions of the training features.
train_X.shape

In [None]:
# Display the dimensions of the training target.
train_y.shape

### train_X has all the features except temperature, and train_y has the corresponding temperature for those features. In supervised machine learning we first feed the model inputs together with their associated outputs, and then check it against new inputs.

In [None]:
# Preview the first rows of the training target.
train_y.head()

# Some more Visualization

In [None]:
# Scatter plot of minimum temperature (x-axis) against temperature (y-axis)
# for the 'weth' subset.
plt.scatter(x=weth['mintempC'], y=weth['tempC'])

# Axis labels.
plt.xlabel("Minimum Temperature")
plt.ylabel("Temperature")

# Render the figure.
plt.show()

In [None]:
# Scatter plot of heat index (x-axis) against temperature (y-axis)
# for the 'weth' subset.
plt.scatter(x=weth['HeatIndexC'], y=weth['tempC'])

# Axis labels.
plt.xlabel("Heat Index")
plt.ylabel("Temperature")

# Render the figure.
plt.show()

In [None]:
# Scatter plot of pressure (x-axis) against temperature (y-axis)
# for the 'weth' subset.
plt.scatter(x=weth['pressure'], y=weth['tempC'])

# Axis labels.
plt.xlabel("Pressure")
plt.ylabel("Temperature")

# Render the figure.
plt.show()

In [None]:
# Scatter plot of minimum temperature (x-axis) against temperature (y-axis)
# for the 'weth' subset.
# NOTE(review): this repeats the earlier minimum-temperature scatter plot
# exactly; possibly another column (e.g. 'maxtempC') was intended - confirm.
plt.scatter(x=weth['mintempC'], y=weth['tempC'])

# Axis labels.
plt.xlabel("Minimum Temperature")
plt.ylabel("Temperature")

# Render the figure.
plt.show()

# Multiple Linear Regression

In [None]:
# Multiple linear regression with scikit-learn's LinearRegression.
model = LinearRegression()

# Fit the model on the training features and their temperature values.
model.fit(X=train_X, y=train_y)

In [None]:
# Predict the temperature for the held-out test features with the fitted
# linear model.
prediction = model.predict(X=test_X)

In [None]:
# Mean absolute error between the linear-regression predictions and the
# actual test temperatures.
np.absolute(prediction - test_y).mean()

In [None]:
# Print the value returned by model.score() on the test set, to 2 decimals.
print(f'Variance score: {model.score(test_X, test_y):.2f}')

In [None]:
# Round the predictions to two decimals in a single vectorized call instead
# of an element-by-element Python loop. 'prediction' is rebound to the
# rounded array, so later cells keep seeing the rounded values as before.
prediction = np.round(prediction, 2)

# Actual vs. predicted temperature for the test rows, plus their difference.
linear_df = pd.DataFrame({'Actual':test_y,'Prediction':prediction,'diff':(test_y-prediction)})
linear_df

In [None]:
# Plot actual vs. predicted temperature for 2019-2020 at daily frequency.
# The test rows come from a shuffled train/test split, so the datetime index
# is not in date order; it is sorted first because label-slicing an unsorted
# DatetimeIndex with '2019':'2020' raises KeyError on recent pandas.
# Resampler.ffill() replaces the deprecated Resampler.fillna(method='pad').
linear_df.sort_index()['2019':'2020'].resample('D').ffill()[['Actual', 'Prediction']].plot(figsize=(25,20))
plt.ylabel('Predicted tempC')
plt.suptitle('Comparision of Predicted temp by Linear regression and Actual Temp', fontsize=20)

# Decision Tree Regression 

In [None]:
# Decision tree regression with scikit-learn.
from sklearn.tree import DecisionTreeRegressor

# A fixed random_state keeps the fitted tree reproducible between runs.
regressor = DecisionTreeRegressor(random_state=0)

# Fit the tree on the training features and their temperature values.
regressor.fit(X=train_X, y=train_y)


In [None]:
# Predict the temperature for the held-out test features with the fitted
# decision tree.
prediction2 = regressor.predict(test_X)

# Mean absolute error (MAE): the mean of the absolute differences between
# the predicted and the actual test temperatures.
mae = np.mean(np.absolute(prediction2 - test_y))

# Display the MAE as the cell output, like the linear-regression MAE cell;
# previously the value was computed but never shown.
mae

In [None]:
# Print the value returned by regressor.score() on the test set, to 2 decimals.
print(f'Variance score: {regressor.score(test_X, test_y):.2f}')

In [None]:
# Round the predictions to two decimals in a single vectorized call instead
# of an element-by-element Python loop. 'prediction2' is rebound to the
# rounded array, so later cells keep seeing the rounded values as before.
prediction2 = np.round(prediction2, 2)

# Actual vs. predicted temperature for the test rows, plus their difference.
decisionTree_df = pd.DataFrame({'Actual':test_y,'Prediction':prediction2,'diff':(test_y-prediction2)})
decisionTree_df

In [None]:
# Plot actual vs. predicted temperature for 2019-2020 at daily frequency.
# The test rows come from a shuffled train/test split, so the datetime index
# is not in date order; it is sorted first because label-slicing an unsorted
# DatetimeIndex with '2019':'2020' raises KeyError on recent pandas.
# Resampler.ffill() replaces the deprecated Resampler.fillna(method='pad').
decisionTree_df.sort_index()['2019':'2020'].resample('D').ffill()[['Actual', 'Prediction']].plot(figsize=(25,20))
plt.ylabel('Predicted tempC')
plt.suptitle('Comparision of Predicted temp by Decision Tree Regression and Actual temp', fontsize=20)

# Random Forest Regression

In [None]:
# Random forest regression with scikit-learn.
from sklearn.ensemble import RandomForestRegressor

# 100 trees, each limited to a depth of 90; a fixed random_state keeps the
# fitted forest reproducible between runs.
regr = RandomForestRegressor(
    n_estimators=100,
    max_depth=90,
    random_state=0,
)

# Fit the forest on the training features and their temperature values.
regr.fit(X=train_X, y=train_y)

In [None]:
# Predict the temperature for the held-out test features with the fitted
# random forest.
prediction3 = regr.predict(test_X)

# Mean absolute error (MAE): the mean of the absolute differences between
# the predicted and the actual test temperatures.
mae = np.mean(np.absolute(prediction3 - test_y))

# Display the MAE as the cell output, like the linear-regression MAE cell;
# previously the value was computed but never shown.
mae

In [None]:
# Print the value returned by regr.score() on the test set, to 2 decimals.
print(f'Variance score: {regr.score(test_X, test_y):.2f}')

In [None]:
# Round the predictions to two decimals in a single vectorized call instead
# of an element-by-element Python loop. 'prediction3' is rebound to the
# rounded array, so later cells keep seeing the rounded values as before.
prediction3 = np.round(prediction3, 2)

# Actual vs. predicted temperature for the test rows, plus their difference.
randomForest_df = pd.DataFrame({'Actual':test_y,'Prediction':prediction3,'diff':(test_y-prediction3)})
randomForest_df

In [None]:
# Plot actual vs. predicted temperature for 2019-2020 at daily frequency.
# The test rows come from a shuffled train/test split, so the datetime index
# is not in date order; it is sorted first because label-slicing an unsorted
# DatetimeIndex with '2019':'2020' raises KeyError on recent pandas.
# Resampler.ffill() replaces the deprecated Resampler.fillna(method='pad').
randomForest_df.sort_index()['2019':'2020'].resample('D').ffill()[['Actual', 'Prediction']].plot(figsize=(25,20))
plt.ylabel('Predicted tempC')
plt.suptitle('Comparision of Predicted temp by Random Forest Regression and Actual temp', fontsize=20)

# Comparison of the difference between actual and predicted tempC values for Linear Regression, Random Forest Regression and Decision Tree Regression

In [None]:
import matplotlib.pyplot as plt

# Plot the 'diff' column (actual minus predicted temperature) of all three
# result frames on one figure, for 2019-2020 at daily frequency.
plt.figure(figsize=(25,20))
for model_label, result_df in (
    ('Linear Regression', linear_df),
    ('Random Forest', randomForest_df),
    ('Decision Tree', decisionTree_df),
):
    # The result frames are indexed by the shuffled test rows, so the index is
    # sorted before label-slicing (an unsorted DatetimeIndex raises KeyError
    # on recent pandas). Resampler.ffill() replaces the deprecated
    # Resampler.fillna(method='pad').
    daily_diff = result_df.sort_index()['2019':'2020'].resample('D').ffill()['diff']
    plt.plot(daily_diff, label=model_label)

# set the title of the plot
plt.title('Comparision between difference of Actual and predicted temp values of Linear Regression, Random Forest regression and Decision Tree regression')

# set the x-axis label
plt.xlabel('Date')

# set the y-axis label
plt.ylabel('Difference')

# show the legend
plt.legend()

# show the plot
plt.show()

In [None]:
# This code imports the r2_score function from the sklearn.metrics module.
# The r2_score function is used to calculate the R-squared (coefficient of determination) value, which measures the proportion of variance in the output variable that can be explained by the input variables.
from sklearn.metrics import r2_score

# Calculating R2-score for Multiple Linear Regression

In [None]:
# Error metrics for the linear-regression predictions on the test set.
# The second line prints the mean of the squared errors, i.e. the mean
# squared error; it was previously mislabelled as a residual sum of squares.
print(f"Mean absolute error: {np.mean(np.absolute(prediction - test_y)):.2f}")
print(f"Mean squared error (MSE): {np.mean((prediction - test_y) ** 2):.2f}")
print(f"R2-score: {r2_score(test_y, prediction):.2f}")

# Calculating R2-score for Decision Tree Regression

In [None]:
# Error metrics for the decision-tree predictions on the test set.
# The second line prints the mean of the squared errors, i.e. the mean
# squared error; it was previously mislabelled as a residual sum of squares.
print(f"Mean absolute error: {np.mean(np.absolute(prediction2 - test_y)):.2f}")
print(f"Mean squared error (MSE): {np.mean((prediction2 - test_y) ** 2):.2f}")
print(f"R2-score: {r2_score(test_y, prediction2):.2f}")

# Calculating R2-score for Random Forest Regression

In [None]:
from sklearn.metrics import r2_score

# Error metrics for the random-forest predictions on the test set.
# The second line prints the mean of the squared errors, i.e. the mean
# squared error; it was previously mislabelled as a residual sum of squares.
print(f"Mean absolute error: {np.mean(np.absolute(prediction3 - test_y)):.2f}")
print(f"Mean squared error (MSE): {np.mean((prediction3 - test_y) ** 2):.2f}")
print(f"R2-score: {r2_score(test_y, prediction3):.2f}")

# Thank You