In [77]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import sklearn
from sklearn.model_selection import train_test_split

In [78]:
traindata = pd.read_csv('datasets/upvotes/train_NIR5Yl1.csv')
traindata.head()

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes
0,52664,a,3942.0,2.0,155623,7855.0,42.0
1,327662,a,26046.0,12.0,21781,55801.0,1175.0
2,468453,c,1358.0,4.0,56177,8067.0,60.0
3,96996,a,264.0,3.0,168793,27064.0,9.0
4,131465,c,4271.0,4.0,112223,13986.0,83.0


In [79]:
testdata = pd.read_csv('datasets/upvotes/test_8i3B3FC.csv')
testdata.head()

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views
0,366953,a,5645.0,3.0,50652,33200.0
1,71864,c,24511.0,6.0,37685,2730.0
2,141692,i,927.0,1.0,135293,21167.0
3,316833,i,21.0,6.0,166998,18528.0
4,440445,i,4475.0,10.0,53504,57240.0


In [80]:
testdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141448 entries, 0 to 141447
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ID          141448 non-null  int64  
 1   Tag         141448 non-null  object 
 2   Reputation  141448 non-null  float64
 3   Answers     141448 non-null  float64
 4   Username    141448 non-null  int64  
 5   Views       141448 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 6.5+ MB


## Clean Data

In [81]:
testdata =testdata.dropna()
testdata.describe()

Unnamed: 0,ID,Reputation,Answers,Username,Views
count,141448.0,141448.0,141448.0,141448.0,141448.0
mean,235743.073497,7920.927,3.914873,81348.231117,29846.33
std,136269.867118,27910.72,3.57746,49046.098215,80343.74
min,7.0,0.0,0.0,4.0,9.0
25%,117797.0,286.0,2.0,40222.75,2608.0
50%,235830.0,1245.0,3.0,78795.5,8977.0
75%,353616.0,5123.0,5.0,122149.0,26989.25
max,471488.0,1042428.0,73.0,175737.0,5004669.0


## Ordinal Encoder

In [82]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

In [83]:
tag_cat = traindata[['Tag']]
tag_cat_encoded = ordinal_encoder.fit_transform(tag_cat)
tag_cat_encoded[:10]

array([[0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [7.],
       [1.],
       [4.],
       [4.],
       [1.]])

## One-Hot Encoder

In [84]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
tag_cat_1hot = cat_encoder.fit_transform(tag_cat)
tag_cat_1hot.toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Custom Transformer

In [85]:
class custom_transformer:
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        answer_index, views_index = 3,5
        answers_per_view = X[:, answer_index] / X[:, views_index]
        return np.c_[X, answers_per_view]


In [86]:
extra_cols = transform(traindata.values)

In [87]:
training_extra = pd.DataFrame(
    extra_cols,
    columns=list(traindata.columns)+["answers per view"],
    index=traindata.index)
training_extra.head()

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes,answers per view
0,52664,a,3942.0,2.0,155623,7855.0,42.0,0.000255
1,327662,a,26046.0,12.0,21781,55801.0,1175.0,0.000215
2,468453,c,1358.0,4.0,56177,8067.0,60.0,0.000496
3,96996,a,264.0,3.0,168793,27064.0,9.0,0.000111
4,131465,c,4271.0,4.0,112223,13986.0,83.0,0.000286


## Scale Views from 0 - 1

In [88]:
from sklearn import preprocessing
x = traindata[["Answers","Views"]] #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
x_scaled[:5]

array([[0.02631579, 0.00149989],
       [0.15789474, 0.01066555],
       [0.05263158, 0.00154042],
       [0.03947368, 0.005172  ],
       [0.05263158, 0.00267193]])

In [106]:
data = traindata.drop("Upvotes", axis = 1)
vals = traindata.Upvotes
traindata.head()

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes
0,52664,a,3942.0,2.0,155623,7855.0,42.0
1,327662,a,26046.0,12.0,21781,55801.0,1175.0
2,468453,c,1358.0,4.0,56177,8067.0,60.0
3,96996,a,264.0,3.0,168793,27064.0,9.0
4,131465,c,4271.0,4.0,112223,13986.0,83.0


## Grid search for optimization

In [107]:
from sklearn.ensemble import GradientBoostingRegressor
gra_reg =  GradientBoostingRegressor()

In [120]:
train, test = train_test_split(traindata, test_size=0.30)
train[:5]

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes
307650,468144,c,854.0,3.0,83401,1512.0,20.0
121371,11923,c,859.0,4.0,88355,486.0,2.0
128897,250656,p,762.0,1.0,43920,9180.0,34.0
300177,222530,j,5806.0,6.0,55647,45352.0,383.0
94284,97696,j,215.0,5.0,46918,21520.0,18.0


In [122]:
data = train.drop("Upvotes", axis=1) # drop labels for training set
labels = train["Upvotes"].copy()

In [123]:
preped_data = full_pipeline.fit_transform(data)

In [125]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(preped_data, labels)

RandomForestRegressor(random_state=42)

In [128]:
from sklearn.metrics import mean_squared_error
predictions = forest_reg.predict(preped_data)
forest_mse = mean_squared_error(labels, predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

555.1864541731752

In [135]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(preped_data, labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [136]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 10}

## Pipeline for all data processing and Prediction

#### number pipeline

In [None]:
traindata.head()

In [None]:
traindata = traindata.drop("ID", axis = 1)
# drop ID because it provides no real information

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

train_num = traindata.drop(["Tag","Username"], axis=1)
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
train_num_tr = num_pipeline.fit_transform(train_num)
train_num_tr[:5]

In [112]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import  RandomForestRegressor

num_attribs = list(train_num)
num_attribs.pop
num
cat_attribs = ["Tag"]


full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

prepared = full_pipeline.fit_transform(traindata)

In [117]:
print(num_attribs)

['Reputation', 'Answers', 'Views']


In [116]:
num_attribs.pop()

'Upvotes'