<a href="https://colab.research.google.com/github/danielrobl3s/ML-notebooks/blob/master/New_York_Average_Temperature_(Multiple_linear_regression).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Average temperature of New York City using linear regression, multiple linear regression, polynomial regression and support vector regression

In [58]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error

# Importing the dataset

In [27]:
# Load the temperature records; the CSV path is relative, so the file must be
# in the notebook's working directory.
dataset = pd.read_csv('nyc_temperature.csv')

In [38]:
# 'T' entries are treated as missing here. Swap them for a real NaN (np.nan)
# rather than the string 'NaN', so pandas and scikit-learn recognise them as
# missing without relying on string-to-float coercion further down.
# NOTE(review): in many weather exports 'T' denotes a trace amount rather than
# a missing reading — confirm that imputing these values is the intended handling.
dataset = dataset.replace('T', np.nan)

# Features: every column except the date and the target column.
X = dataset.drop(columns=['date', 'tavg']).values
# Target: the 'tavg' (average temperature) column.
Y = dataset.loc[:, 'tavg'].values

In [34]:
# Show the feature matrix; quoted entries in the output are values that are
# still stored as strings at this point.
print(X)

[[60 40 13.9 ... '0.08' '0' '0']
 [41 35 2.1 ... '0' '0' '0']
 [45 39 6.3 ... 'NaN' '0' '0']
 ...
 [44 38 4.3 ... '0.29' '0' '0']
 [43 38 4.0 ... '0.49' '0' '0']
 [46 38 5.7 ... '0.01' '0' '0']]


In [35]:
# Show the target vector (the 'tavg' column).
print(Y)

[50.  38.  42.  42.  44.5 40.5 30.5 41.  40.5 32.5 27.  27.5 29.  28.5
 31.  35.5 30.  35.5 35.5 28.  10.5 22.5 38.5 46.  34.5 30.  39.  32.
 34.5 21.5 10.  17.  26.5 41.5 47.5 50.5 40.5 40.5 44.  28.5 30.  34.
 30.5 37.5 39.5 48.5 39.  31.5 34.  30.  29.  42.  40.  35.5 41.  35.5
 31.  26.  31.  30.5 35.5 37.5 33.5 26.5 23.  26.  31.  40.5 39.  46.
 38.5 39.  49.  60.  48.  39.5 39.5 42.5 44.  42.5 43.5 41.5 49.  50.
 40.5 39.  44.  53.  55.5 49.5 39.5 43.5 54.  51.5 42.  52.5 55.  63.
 46.5 51.  46.  55.5 66.5 63.  53.5 54.  54.  56.5 65.5 58.5 57.5 54.5
 64.5 61.  56.  55.5 52.5 49.  51.5 52.5 51.5 59.  50.  59.  51.5 58.
 58.  61.  56.5 64.5 63.5 51.5 46.  48.  58.  64.  70.5 67.  68.  77.
 64.  66.5 66.5 68.5 63.5 75.5 73.5 66.5 62.  62.  71.  71.5 73.  66.
 65.  74.  77.  72.  71.5 69.5 64.5 70.5 68.  61.  68.5 73.  75.5 73.
 70.  68.  70.  71.5 71.5 76.5 78.  79.  79.5 81.  82.  85.  78.  77.5
 81.5 81.  82.  78.5 84.5 79.  75.5 80.  82.5 80.  81.5 80.5 85.  80.5
 83.5 84.  74.5

# Taking care of missing data

In [39]:
from sklearn.impute import SimpleImputer

# Fill NaN entries in the columns from index 5 onward with each column's mean,
# writing the imputed values back into X in place.
# NOTE(review): the imputer is fitted on the full matrix before the train/test
# split, so the column means include test rows — consider fitting on the
# training portion only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 5:] = imputer.fit_transform(X[:, 5:])

In [40]:
# Re-inspect the feature matrix after imputation.
print(X)

[[60 40 13.9 ... 0.08 0.0 0.0]
 [41 35 2.1 ... 0.0 0.0 0.0]
 [45 39 6.3 ... 0.16249201277955272 0.0 0.0]
 ...
 [44 38 4.3 ... 0.29 0.0 0.0]
 [43 38 4.0 ... 0.49 0.0 0.0]
 [46 38 5.7 ... 0.01 0.0 0.0]]


# Splitting into train and test set

In [55]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Training the Multiple Linear Regression model on the Training set

In [42]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model (default settings) on the training split.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

# Predicting the test results

In [54]:
# Predict the target for every held-out row and print the raw predictions.
# (For a side-by-side view, y_pred and y_test can be reshaped into columns and
# concatenated along axis 1.)
y_pred = regressor.predict(X=x_test)

print(y_pred)

[54.  71.  48.5 39.  44.  39.5 39.5 36.5 72.5 30.5 51.5 66.5 74.5 67.
 63.  88.  59.  55.5 76.5 64.  66.5 83.  30.5 60.  58.  73.5 39.  37.
 61.  80.  51.5 66.5 35.5 40.5 41.  40.5 35.5 46.  77.  23.  80.  63.
 80.5 64.  10.5 45.  80.5 79.5 76.5 26.  76.5 79.5 70.5 68.5 54.  35.5
 40.  64.  49.5 39.5 41.  74.  41.  46.  44.  48.5 31.  50.  81.  69.5
 48.  70.  56. ]


In [45]:
# Show the held-out targets for comparison against the printed y_pred.
print(y_test)

[54.  71.  48.5 39.  44.  39.5 39.5 36.5 72.5 30.5 51.5 66.5 74.5 67.
 63.  88.  59.  55.5 76.5 64.  66.5 83.  30.5 60.  58.  73.5 39.  37.
 61.  80.  51.5 66.5 35.5 40.5 41.  40.5 35.5 46.  77.  23.  80.  63.
 80.5 64.  10.5 45.  80.5 79.5 76.5 26.  76.5 79.5 70.5 68.5 54.  35.5
 40.  64.  49.5 39.5 41.  74.  41.  46.  44.  48.5 31.  50.  81.  69.5
 48.  70.  56. ]


In [59]:
# Score the held-out predictions with mean squared error.
# NOTE(review): an MSE this close to zero (~1e-29) means the model reproduces
# the target almost exactly. The printed rows suggest the first two feature
# columns average to tavg (e.g. 60 and 40 -> 50), i.e. probable target
# leakage — confirm before trusting this score.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 5.826764384865337e-29
