In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse

In [3]:
data = pd.read_csv('globalWarming.csv')
data.drop('Unnamed: 0', axis = 1, inplace = True)
data.head()

Unnamed: 0,date,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1850,7.900667,0.876417,13.476667,2.394833,1.964333,1.571167,14.867167,0.308167
1,1851,8.178583,0.881917,13.081,2.39725,2.203917,1.632417,14.991833,0.312083
2,1852,8.100167,0.91825,13.397333,2.61925,2.337,1.382917,15.0065,0.316417
3,1853,8.041833,0.835,13.886583,2.095083,1.8925,1.355583,14.955167,0.283833
4,1854,8.2105,0.825667,13.977417,1.783333,1.762167,1.357,14.991,0.276417


In [5]:
data = data[[c for c in data if c != "LandAverageTemperature"] + ["LandAverageTemperature"]]
data.head()

Unnamed: 0,date,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty,LandAverageTemperature
0,1850,0.876417,13.476667,2.394833,1.964333,1.571167,14.867167,0.308167,7.900667
1,1851,0.881917,13.081,2.39725,2.203917,1.632417,14.991833,0.312083,8.178583
2,1852,0.91825,13.397333,2.61925,2.337,1.382917,15.0065,0.316417,8.100167
3,1853,0.835,13.886583,2.095083,1.8925,1.355583,14.955167,0.283833,8.041833
4,1854,0.825667,13.977417,1.783333,1.762167,1.357,14.991,0.276417,8.2105


In [10]:
data = data[['date','LandAverageTemperatureUncertainty'] + ['LandAndOceanAverageTemperature'] + [c for c in data if c not in ['date','LandAverageTemperatureUncertainty','LandAndOceanAverageTemperature']]]
data.head()

Unnamed: 0,date,LandAverageTemperatureUncertainty,LandAndOceanAverageTemperature,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperatureUncertainty,LandAverageTemperature
0,1850,0.876417,14.867167,13.476667,2.394833,1.964333,1.571167,0.308167,7.900667
1,1851,0.881917,14.991833,13.081,2.39725,2.203917,1.632417,0.312083,8.178583
2,1852,0.91825,15.0065,13.397333,2.61925,2.337,1.382917,0.316417,8.100167
3,1853,0.835,14.955167,13.886583,2.095083,1.8925,1.355583,0.283833,8.041833
4,1854,0.825667,14.991,13.977417,1.783333,1.762167,1.357,0.276417,8.2105


In [11]:
X = data.iloc[:, 1:3]
y = data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)
X.head()

Unnamed: 0,LandAverageTemperatureUncertainty,LandAndOceanAverageTemperature
0,0.876417,14.867167
1,0.881917,14.991833
2,0.91825,15.0065
3,0.835,14.955167
4,0.825667,14.991


In [12]:
warnings.filterwarnings('ignore')
lreg = LinearRegression()
sfs1 = sfs(lreg, k_features=2, forward=True, verbose=2, scoring='neg_mean_squared_error')
sfs1 = sfs1.fit(X, y)


[2024-06-25 21:28:30] Features: 1/2 -- score: -0.017429347213889664
[2024-06-25 21:28:30] Features: 2/2 -- score: -0.02042768684169417

In [13]:
feat_names = list(sfs1.k_feature_names_)
print(feat_names)

['LandAverageTemperatureUncertainty', 'LandAndOceanAverageTemperature']


In [14]:
std_scaler = StandardScaler()
std_scaler.fit(X_train, y_train)
linear = LinearRegression()
linear.fit(X_train, y_train)

In [17]:
pred = linear.predict(X_test)
rmse(pred, y_test)

0.11901071427286568

creating a function to get the best combination of random state for the linear model

In [36]:
def best_random_state(r_state, ratio):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = ratio, random_state = r_state)
    model = LinearRegression()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return mse(y_test, pred)

In [37]:
random_state = 0
test_size = 0
error = 1
for state in range(2, 20):
    for ratio in range(10, 30, 2):
        test_error = best_random_state(state, ratio/100)
        print(f'{test_error} for random_state = {state} and test_size = {ratio/100}')
        if(error>test_error):
            error = test_error
            random_state = state
            test_size = ratio/100
print('\n')
print(f'least error is {error} for random_state = {random_state} and test_size = {test_size}')


0.01057506782690608 for random_state = 2 and test_size = 0.1
0.010469225761268866 for random_state = 2 and test_size = 0.12
0.01238449109517239 for random_state = 2 and test_size = 0.14
0.013751905101062079 for random_state = 2 and test_size = 0.16
0.013399772463855946 for random_state = 2 and test_size = 0.18
0.013476967609080473 for random_state = 2 and test_size = 0.2
0.01589958145773952 for random_state = 2 and test_size = 0.22
0.015094505398085179 for random_state = 2 and test_size = 0.24
0.014602486462800067 for random_state = 2 and test_size = 0.26
0.013783532189719524 for random_state = 2 and test_size = 0.28
0.013299376767318785 for random_state = 3 and test_size = 0.1
0.015663423564086534 for random_state = 3 and test_size = 0.12
0.015536424180005183 for random_state = 3 and test_size = 0.14
0.015012699529930814 for random_state = 3 and test_size = 0.16
0.014769446986145298 for random_state = 3 and test_size = 0.18
0.013971492254057869 for random_state = 3 and test_size = 0.2

creating a function to get best random_state in randomForestclassfier