In [250]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import linear_model, metrics, preprocessing, decomposition, svm
from sklearn.svm import SVR
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit,cross_val_score, GridSearchCV
 
import catboost
from catboost import CatBoostRegressor

In [251]:
def split_to_train_and_test(X, y, test_ratio, rand_state):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=rand_state)
    return X_train, X_test, y_train, y_test

In [252]:
df = pd.read_csv('Marvel Dataframe after drop and fill12.csv')
for col in df.columns:
    if 'Name' in col:
        df.drop(columns = col, inplace=True)
df

Unnamed: 0,Year,Appearnces,Gender_Male,Gender_Other,M Status_Other,M Status_Single,Hair C_Blond,Hair C_Brown,Hair C_Grey,Hair C_No Hair At All,...,Eye C_Blue,Eye C_Brown,Eye C_Green,Eye C_Other,L status_Deceased,Identity_No Dual,Identity_Public,Identity_Secret,Reality_Other,Citenzenship_Other
0,2021,1.098612,1,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,1988,0.693147,1,0,0,1,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
2,2008,2.302585,1,0,0,1,1,0,0,0,...,1,0,0,0,0,0,1,0,1,0
3,1968,7.020191,1,0,1,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
4,2012,1.098612,1,0,0,1,0,0,0,1,...,0,0,1,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24499,1943,0.000000,1,0,0,1,0,1,0,0,...,0,0,0,1,1,1,0,0,0,1
24500,2018,0.000000,1,0,0,1,0,0,0,0,...,0,0,0,1,0,1,0,0,1,1
24501,2021,0.000000,1,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
24502,2020,0.693147,1,0,0,0,0,0,0,1,...,0,0,0,0,1,1,0,0,0,1


Normilize the Year column with the MinMax method

In [253]:
df['Year'] = MinMaxScaler().fit_transform(np.array(df['Year']).reshape(-1,1))
df.Year.describe

<bound method NDFrame.describe of 0        0.987952
1        0.590361
2        0.831325
3        0.349398
4        0.879518
           ...   
24499    0.048193
24500    0.951807
24501    0.987952
24502    0.975904
24503    0.108434
Name: Year, Length: 24504, dtype: float64>

In [254]:
# Labels are the values we want to predict
labels = df['Appearnces']

# Remove the labels from the features
df = df.drop('Appearnces', axis = 1)

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = split_to_train_and_test(df, labels, 0.2, 0)

In [236]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (19603, 22)
Training Labels Shape: (19603,)
Testing Features Shape: (4901, 22)
Testing Labels Shape: (4901,)


In [237]:
# Initialize LinearRegression model:
lr = LinearRegression().fit(train_features, train_labels)

# Get predictions
predictions = lr.predict(test_features)

non_negative_preditions = list()
for pred in predictions:
    if pred < 0:
        non_negative_preditions.append(0)
    else:
        non_negative_preditions.append(pred)


df_LR_result = pd.DataFrame(columns=['ACTUAL','PREDICTED'])                            
df_LR_result['ACTUAL'] = test_labels                             
df_LR_result['PREDICTED'] = non_negative_preditions

# Displaying results:
df_LR_result

Unnamed: 0,ACTUAL,PREDICTED
14983,0.000000,0.903879
6964,0.000000,0.804230
16589,0.000000,0.681642
20399,1.386294,1.412739
20385,0.000000,1.543874
...,...,...
17696,0.000000,0.928975
16336,1.791759,0.903645
8215,2.639057,1.576405
22170,0.693147,1.768366


In [249]:
# Calculating the R^2 and SSE scores of the prediction:

print(f"LinearRegression R^2 score : {round(r2_score(df_LR_result['ACTUAL'], df_LR_result['PREDICTED']) ,4) }")
print(f"LinearRegression SSE score : {round(np.sqrt(mean_squared_error(df_LR_result['ACTUAL'], df_LR_result['PREDICTED'])) ,4) }")

LinearRegression R^2 score : 0.0951
LinearRegression SSE score : 1.2187


In [239]:
# Initialize RandromForestTree model:
rf = RandomForestRegressor(n_estimators = 200).fit(train_features, train_labels)

# Get predictions
predictions = rf.predict(test_features)

non_negative_preditions = list()
for pred in predictions:
    if pred < 0:
        non_negative_preditions.append(0)
    else:
        non_negative_preditions.append(pred)

df_rf_result = pd.DataFrame(columns=['ACTUAL','PREDICTED'])                            
df_rf_result['ACTUAL'] = test_labels                           
df_rf_result['PREDICTED'] = non_negative_preditions

# Displaying results:
df_rf_result

Unnamed: 0,ACTUAL,PREDICTED
14983,0.000000,3.127010
6964,0.000000,0.610037
16589,0.000000,0.287655
20399,1.386294,0.540810
20385,0.000000,0.934422
...,...,...
17696,0.000000,0.030761
16336,1.791759,0.914674
8215,2.639057,1.013924
22170,0.693147,3.645740


In [247]:
# Calculating the R^2 and SSE scores of the prediction:

print(f"RandomForestRegressor R^2 score : {round(r2_score(df_rf_result['ACTUAL'], df_rf_result['PREDICTED']) ,4) }")
print(f"RandomForestRegressor SSE score : {round(np.sqrt(mean_squared_error(df_rf_result['ACTUAL'], df_rf_result['PREDICTED'])) ,4) }")

RandomForestRegressor R^2 score : 0.0473
RandomForestRegressor SSE score : 1.2504


In [241]:
# Initialize SVR model:
svr = SVR().fit(train_features, train_labels)

# Get predictions
predictions = svr.predict(test_features)

non_negative_preditions = list()
for pred in predictions:
    if pred < 0:
        non_negative_preditions.append(0)
    else:
        non_negative_preditions.append(pred)


df_svr_result = pd.DataFrame(columns=['ACTUAL','PREDICTED'])                          
df_svr_result['ACTUAL'] = test_labels                           
df_svr_result['PREDICTED'] = non_negative_preditions

# Displaying results:
df_svr_result

Unnamed: 0,ACTUAL,PREDICTED
14983,0.000000,0.307611
6964,0.000000,0.384213
16589,0.000000,0.000000
20399,1.386294,0.615508
20385,0.000000,1.550609
...,...,...
17696,0.000000,0.000000
16336,1.791759,0.313866
8215,2.639057,0.637213
22170,0.693147,1.756494


In [248]:
# Calculating the R^2 and SSE scores of the prediction:

print(f"SVR R^2 score : {round(r2_score(df_svr_result['ACTUAL'], df_svr_result['PREDICTED']) ,4) }")
print(f"SVR SSE score : {round(np.sqrt(mean_squared_error(df_svr_result['ACTUAL'], df_svr_result['PREDICTED'])) ,4) }")

SVR R^2 score : 0.0773
SVR SSE score : 1.2306


In [243]:
# Initialize CatBoostRegressor
CBR = CatBoostRegressor().fit(train_features, train_labels)

# Get predictions
predictions = CBR.predict(test_features)

non_negative_preditions = list()
for pred in predictions:
    if pred < 0:
        non_negative_preditions.append(0)
    else:
        non_negative_preditions.append(pred)
        

df_cbr_result = pd.DataFrame(columns=['ACTUAL','PREDICTED'])                          
df_cbr_result['ACTUAL'] = test_labels                           
df_cbr_result['PREDICTED'] = non_negative_preditions

Learning rate set to 0.065519
0:	learn: 1.2939966	total: 14.9ms	remaining: 14.9s
1:	learn: 1.2844873	total: 20.4ms	remaining: 10.2s
2:	learn: 1.2759425	total: 67.7ms	remaining: 22.5s
3:	learn: 1.2687642	total: 74.2ms	remaining: 18.5s
4:	learn: 1.2619737	total: 77.5ms	remaining: 15.4s
5:	learn: 1.2559910	total: 80.2ms	remaining: 13.3s
6:	learn: 1.2511421	total: 82.9ms	remaining: 11.8s
7:	learn: 1.2461756	total: 95.8ms	remaining: 11.9s
8:	learn: 1.2418571	total: 98.3ms	remaining: 10.8s
9:	learn: 1.2377314	total: 103ms	remaining: 10.2s
10:	learn: 1.2340490	total: 125ms	remaining: 11.2s
11:	learn: 1.2313677	total: 148ms	remaining: 12.2s
12:	learn: 1.2282436	total: 163ms	remaining: 12.4s
13:	learn: 1.2255414	total: 178ms	remaining: 12.5s
14:	learn: 1.2229855	total: 190ms	remaining: 12.5s
15:	learn: 1.2205632	total: 195ms	remaining: 12s
16:	learn: 1.2185514	total: 207ms	remaining: 12s
17:	learn: 1.2169273	total: 211ms	remaining: 11.5s
18:	learn: 1.2153491	total: 216ms	remaining: 11.2s
19:	le

170:	learn: 1.1514544	total: 1.48s	remaining: 7.19s
171:	learn: 1.1512977	total: 1.49s	remaining: 7.17s
172:	learn: 1.1510942	total: 1.51s	remaining: 7.22s
173:	learn: 1.1509224	total: 1.51s	remaining: 7.18s
174:	learn: 1.1506912	total: 1.51s	remaining: 7.14s
175:	learn: 1.1505291	total: 1.52s	remaining: 7.11s
176:	learn: 1.1501616	total: 1.53s	remaining: 7.1s
177:	learn: 1.1500463	total: 1.53s	remaining: 7.07s
178:	learn: 1.1498059	total: 1.53s	remaining: 7.04s
179:	learn: 1.1495982	total: 1.54s	remaining: 7.02s
180:	learn: 1.1492579	total: 1.54s	remaining: 6.99s
181:	learn: 1.1490863	total: 1.55s	remaining: 6.95s
182:	learn: 1.1488607	total: 1.55s	remaining: 6.92s
183:	learn: 1.1487616	total: 1.57s	remaining: 6.97s
184:	learn: 1.1485990	total: 1.59s	remaining: 6.99s
185:	learn: 1.1484880	total: 1.63s	remaining: 7.11s
186:	learn: 1.1482597	total: 1.64s	remaining: 7.13s
187:	learn: 1.1480284	total: 1.65s	remaining: 7.14s
188:	learn: 1.1478342	total: 1.66s	remaining: 7.13s
189:	learn: 1

342:	learn: 1.1210901	total: 3.01s	remaining: 5.77s
343:	learn: 1.1209532	total: 3.02s	remaining: 5.76s
344:	learn: 1.1207865	total: 3.02s	remaining: 5.74s
345:	learn: 1.1206315	total: 3.05s	remaining: 5.76s
346:	learn: 1.1205120	total: 3.05s	remaining: 5.74s
347:	learn: 1.1204273	total: 3.06s	remaining: 5.73s
348:	learn: 1.1202886	total: 3.06s	remaining: 5.71s
349:	learn: 1.1200815	total: 3.07s	remaining: 5.7s
350:	learn: 1.1199216	total: 3.07s	remaining: 5.68s
351:	learn: 1.1197798	total: 3.08s	remaining: 5.67s
352:	learn: 1.1196486	total: 3.08s	remaining: 5.65s
353:	learn: 1.1194860	total: 3.09s	remaining: 5.63s
354:	learn: 1.1193267	total: 3.11s	remaining: 5.65s
355:	learn: 1.1191500	total: 3.11s	remaining: 5.63s
356:	learn: 1.1190202	total: 3.12s	remaining: 5.61s
357:	learn: 1.1188779	total: 3.12s	remaining: 5.6s
358:	learn: 1.1187085	total: 3.14s	remaining: 5.61s
359:	learn: 1.1186239	total: 3.14s	remaining: 5.59s
360:	learn: 1.1184769	total: 3.15s	remaining: 5.57s
361:	learn: 1.

510:	learn: 1.1010318	total: 4.42s	remaining: 4.23s
511:	learn: 1.1009349	total: 4.42s	remaining: 4.21s
512:	learn: 1.1007758	total: 4.42s	remaining: 4.2s
513:	learn: 1.1007084	total: 4.43s	remaining: 4.19s
514:	learn: 1.1005545	total: 4.44s	remaining: 4.18s
515:	learn: 1.1004267	total: 4.44s	remaining: 4.16s
516:	learn: 1.1003294	total: 4.45s	remaining: 4.15s
517:	learn: 1.1002534	total: 4.45s	remaining: 4.14s
518:	learn: 1.1001603	total: 4.47s	remaining: 4.14s
519:	learn: 1.1000427	total: 4.47s	remaining: 4.13s
520:	learn: 1.0999589	total: 4.47s	remaining: 4.11s
521:	learn: 1.0999078	total: 4.48s	remaining: 4.1s
522:	learn: 1.0997858	total: 4.48s	remaining: 4.09s
523:	learn: 1.0996475	total: 4.49s	remaining: 4.08s
524:	learn: 1.0995195	total: 4.49s	remaining: 4.06s
525:	learn: 1.0994362	total: 4.51s	remaining: 4.07s
526:	learn: 1.0993205	total: 4.52s	remaining: 4.06s
527:	learn: 1.0992410	total: 4.54s	remaining: 4.06s
528:	learn: 1.0991003	total: 4.56s	remaining: 4.06s
529:	learn: 1.

673:	learn: 1.0843847	total: 5.78s	remaining: 2.79s
674:	learn: 1.0843201	total: 5.78s	remaining: 2.78s
675:	learn: 1.0842216	total: 5.79s	remaining: 2.77s
676:	learn: 1.0841402	total: 5.79s	remaining: 2.76s
677:	learn: 1.0840924	total: 5.8s	remaining: 2.75s
678:	learn: 1.0839835	total: 5.82s	remaining: 2.75s
679:	learn: 1.0839019	total: 5.83s	remaining: 2.74s
680:	learn: 1.0838307	total: 5.83s	remaining: 2.73s
681:	learn: 1.0837782	total: 5.84s	remaining: 2.72s
682:	learn: 1.0837063	total: 5.84s	remaining: 2.71s
683:	learn: 1.0836530	total: 5.84s	remaining: 2.7s
684:	learn: 1.0835648	total: 5.85s	remaining: 2.69s
685:	learn: 1.0835047	total: 5.87s	remaining: 2.69s
686:	learn: 1.0834214	total: 5.88s	remaining: 2.68s
687:	learn: 1.0833638	total: 5.9s	remaining: 2.68s
688:	learn: 1.0832447	total: 5.9s	remaining: 2.66s
689:	learn: 1.0831560	total: 5.91s	remaining: 2.65s
690:	learn: 1.0830872	total: 5.91s	remaining: 2.64s
691:	learn: 1.0830110	total: 5.91s	remaining: 2.63s
692:	learn: 1.08

841:	learn: 1.0711383	total: 7.13s	remaining: 1.34s
842:	learn: 1.0710813	total: 7.14s	remaining: 1.33s
843:	learn: 1.0710203	total: 7.15s	remaining: 1.32s
844:	learn: 1.0709730	total: 7.16s	remaining: 1.31s
845:	learn: 1.0709007	total: 7.16s	remaining: 1.3s
846:	learn: 1.0708147	total: 7.17s	remaining: 1.29s
847:	learn: 1.0707552	total: 7.18s	remaining: 1.29s
848:	learn: 1.0706717	total: 7.2s	remaining: 1.28s
849:	learn: 1.0705888	total: 7.2s	remaining: 1.27s
850:	learn: 1.0705157	total: 7.23s	remaining: 1.26s
851:	learn: 1.0704713	total: 7.23s	remaining: 1.26s
852:	learn: 1.0703906	total: 7.24s	remaining: 1.25s
853:	learn: 1.0703296	total: 7.26s	remaining: 1.24s
854:	learn: 1.0702750	total: 7.26s	remaining: 1.23s
855:	learn: 1.0701929	total: 7.28s	remaining: 1.22s
856:	learn: 1.0701322	total: 7.28s	remaining: 1.21s
857:	learn: 1.0700533	total: 7.29s	remaining: 1.21s
858:	learn: 1.0699919	total: 7.29s	remaining: 1.2s
859:	learn: 1.0698961	total: 7.29s	remaining: 1.19s
860:	learn: 1.06

In [244]:
# Displaying results:
df_svr_result

Unnamed: 0,ACTUAL,PREDICTED
14983,0.000000,0.307611
6964,0.000000,0.384213
16589,0.000000,0.000000
20399,1.386294,0.615508
20385,0.000000,1.550609
...,...,...
17696,0.000000,0.000000
16336,1.791759,0.313866
8215,2.639057,0.637213
22170,0.693147,1.756494


In [246]:
# Calculating the R^2 and SSE scores of the prediction:

print(f"CatBoostRegresso R^2 score : {round(r2_score(df_cbr_result['ACTUAL'], df_cbr_result['PREDICTED']) ,4) }")
print(f"CatBoostRegresso SSE score : {round(np.sqrt(mean_squared_error(df_cbr_result['ACTUAL'], df_cbr_result['PREDICTED'])) ,4) }")

CatBoostRegresso R^2 score : 0.2019
CatBoostRegresso SSE score : 1.1445
