In [56]:
# import packages

import numpy as np
import pandas as pd
from pandas import value_counts
from pandas.api.types import is_numeric_dtype
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler, Normalizer, RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

import warnings

import category_encoders as ce 

from xgboost import XGBRegressor
import random

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge, LogisticRegression, LinearRegression

In [57]:
# Read the training and test sets from the working directory.
train, test = (pd.read_csv(path) for path in ('C2T2_Train.csv', 'C2T2_Test.csv'))

In [58]:
# Store PageCategory as strings in both frames so it is handled as a label
# rather than as a number by the later encoding step.
for frame in (train, test):
    frame['PageCategory'] = frame['PageCategory'].astype(str)

In [59]:
# Remove the ID column from both frames (in place).
for frame in (train, test):
    frame.drop(columns='ID', inplace=True)

In [60]:
# Find near-constant columns: those where one single value accounts for more
# than 99% of the training rows.
del_list = []
for col in train.columns.tolist():
    # value_counts() is sorted by descending count and skips NaN, so the first
    # entry is the count of the most frequent value.
    counts = train[col].value_counts()
    # Read the count positionally with .iloc[0]. Going through
    # reset_index()[col][0] is pandas-version dependent: from pandas 2.0 on the
    # column named `col` holds the *values*, not the counts, which silently
    # yields wrong ratios (or a TypeError on string columns).
    # An all-NaN column has no counts at all and is skipped here.
    if not counts.empty and counts.iloc[0] / len(train) > 0.99:
        del_list.append(col)
del_list

['PageCC1Min', 'PageCC2Min', 'PageCC3Min', 'PageCC4Min', 'PostPromoted']

In [61]:
# Drop the near-constant columns collected in del_list from both frames (in place).
for frame in (train, test):
    frame.drop(columns=del_list, inplace=True)

In [62]:
# For each pair of highly correlated columns, drop one column of the pair.
del_col = [
    'PageCC5Min',
    'PageCC5Max',
    'PageCC2Avg',
    'PageCC3Avg',
    'PageCC4Std',
    'PageCC4Max',
    'PageCC4Median',
    'TotalComments_CC1',
    'PageCC5Std',
    'PageCC3Max',
    'PageCC4Avg',
    'PageCC1Std',
    'PageCC1Median',
    'PageCC2Max',
    'PageCC3Std',
]
for frame in (train, test):
    frame.drop(columns=del_col, inplace=True)

In [63]:
# Also drop PageCheckIns and PostLength from both frames (in place).
for frame in (train, test):
    frame.drop(columns=['PageCheckIns', 'PostLength'], inplace=True)

In [64]:
# Hash-encode PageCategory into 6 components. Similar to one-hot encoding, but
# with fewer output dimensions at the cost of some information loss.
encoder = ce.HashingEncoder(cols='PageCategory', n_components=6)
encoder.fit(train)
# Apply the encoder fitted on the training frame to both frames.
train, test = (encoder.transform(frame) for frame in (train, test))

In [65]:
# Scale every column except the target with RobustScaler. The scaler is fitted
# on the training frame only and then applied to both frames.
target = 'CommentsNumber'
col = train.columns
col_transform = col[~col.isin([target])]
scaler = RobustScaler()
train[col_transform] = scaler.fit_transform(train[col_transform])
test[col_transform] = scaler.transform(test[col_transform])

# Follow up with sklearn's Normalizer on the same columns.
# NOTE(review): the original note here said "normalizer make it worse", yet the
# step is still applied - confirm whether it is meant to stay.
norm = Normalizer()
train[col_transform] = norm.fit_transform(train[col_transform])
test[col_transform] = norm.transform(test[col_transform])

In [66]:
# Baseline model: plain linear regression on the raw training CSV (re-read from
# disk, without any of the preprocessing applied to `train`), scored with
# 5-fold cross-validated RMSE.
base_data = pd.read_csv('C2T2_Train.csv')
lm = LinearRegression()
cross_score = cross_val_score(
    lm,
    # Select the features by name rather than by position, so the target is
    # excluded wherever it sits in the file.
    base_data.drop(columns=[target]),
    base_data[target],
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
# The scorer returns negated RMSE values; flip the sign for reporting.
print(-cross_score)
# Average over however many folds were run instead of a hard-coded 5.
print(-cross_score.mean())

[3.53316627e+01 2.18873425e+01 2.81505894e+01 7.69721829e+04
 2.53914295e+01]
15416.588794054842


In [67]:
# Linear regression on the preprocessed training frame, scored with 5-fold
# cross-validated RMSE.
lm = LinearRegression()
cross_score = cross_val_score(
    lm,
    # Select the features by name: the frame was rebuilt by the encoder, so
    # relying on the target being the last column is fragile and would leak the
    # target into the features if the column order ever changed.
    train.drop(columns=[target]),
    train[target],
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
# The scorer returns negated RMSE values; flip the sign for reporting.
print(-cross_score)
# Average over however many folds were run instead of a hard-coded 5.
print(-cross_score.mean())

[35.41758393 22.35001525 28.30207199 24.85202193 25.91068178]
27.366474977405336


In [68]:
# Random forest (fixed random_state for repeatable results) on the preprocessed
# training frame, scored with 5-fold cross-validated RMSE.
randf = RandomForestRegressor(random_state=0)
cross_score = cross_val_score(
    randf,
    # Select the features by name: the frame was rebuilt by the encoder, so
    # relying on the target being the last column is fragile and would leak the
    # target into the features if the column order ever changed.
    train.drop(columns=[target]),
    train[target],
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
# The scorer returns negated RMSE values; flip the sign for reporting.
print(-cross_score)
# Average over however many folds were run instead of a hard-coded 5.
print(-cross_score.mean())

[25.75819223 14.02145886 21.0514033  16.59485918 15.71809703]
18.628802121688846
