In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import sklearn

In [10]:
# Load the training CSV; unlike the test file it carries the Upvotes column.
traindata = pd.read_csv('datasets/upvotes/train_NIR5Yl1.csv')
traindata.head()

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes
0,52664,a,3942.0,2.0,155623,7855.0,42.0
1,327662,a,26046.0,12.0,21781,55801.0,1175.0
2,468453,c,1358.0,4.0,56177,8067.0,60.0
3,96996,a,264.0,3.0,168793,27064.0,9.0
4,131465,c,4271.0,4.0,112223,13986.0,83.0


In [11]:
# Load the test CSV; it has the same feature columns but no Upvotes column.
testdata = pd.read_csv('datasets/upvotes/test_8i3B3FC.csv')
testdata.head()

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views
0,366953,a,5645.0,3.0,50652,33200.0
1,71864,c,24511.0,6.0,37685,2730.0
2,141692,i,927.0,1.0,135293,21167.0
3,316833,i,21.0,6.0,166998,18528.0
4,440445,i,4475.0,10.0,53504,57240.0


In [12]:
# Inspect column dtypes and non-null counts before any cleaning.
testdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141448 entries, 0 to 141447
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ID          141448 non-null  int64  
 1   Tag         141448 non-null  object 
 2   Reputation  141448 non-null  float64
 3   Answers     141448 non-null  float64
 4   Username    141448 non-null  int64  
 5   Views       141448 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 6.5+ MB


## Clean Data

In [13]:
# Discard every row that has at least one missing value, then summarise the
# numeric columns. The info() output above reports no nulls in any column,
# so no rows are actually removed for this file.
testdata = testdata.dropna(axis=0, how="any")
testdata.describe()

Unnamed: 0,ID,Reputation,Answers,Username,Views
count,141448.0,141448.0,141448.0,141448.0,141448.0
mean,235743.073497,7920.927,3.914873,81348.231117,29846.33
std,136269.867118,27910.72,3.57746,49046.098215,80343.74
min,7.0,0.0,0.0,4.0,9.0
25%,117797.0,286.0,2.0,40222.75,2608.0
50%,235830.0,1245.0,3.0,78795.5,8977.0
75%,353616.0,5123.0,5.0,122149.0,26989.25
max,471488.0,1042428.0,73.0,175737.0,5004669.0


## Ordinal Encoder

In [14]:
from sklearn.preprocessing import OrdinalEncoder
# Encoder created with default settings; it is fitted on the Tag column in the next cell.
ordinal_encoder = OrdinalEncoder()

In [15]:
# Select Tag as a one-column DataFrame (2-D), not a Series, before encoding.
tag_cat = traindata.loc[:, ['Tag']]
# Fit the encoder on the training tags and encode them in one step.
tag_cat_encoded = ordinal_encoder.fit_transform(tag_cat)
# Preview only the first ten encoded rows.
tag_cat_encoded[:10]

array([[0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [7.],
       [1.],
       [4.],
       [4.],
       [1.]])

## One-Hot Encoder

In [16]:
from sklearn.preprocessing import OneHotEncoder

# Fit a default one-hot encoder on the same single-column Tag frame used above.
cat_encoder = OneHotEncoder()
tag_cat_1hot = cat_encoder.fit_transform(tag_cat)
# toarray() densifies the result for display (the encoder output is presumably
# sparse, which is scikit-learn's default -- confirm for the installed version).
tag_cat_1hot.toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Custom Transformer

In [17]:
def transform(X, answer_index=3, views_index=5):
    """Append an "answers per view" ratio as a new trailing column.

    Parameters
    ----------
    X : array-like, 2-D
        Input rows. Converted with ``np.asarray`` so that nested lists and
        DataFrames are accepted as well as ndarrays.
    answer_index : int, default 3
        Position of the column holding the answer counts. The default matches
        the ``Answers`` column of the training frame in this notebook.
    views_index : int, default 5
        Position of the column holding the view counts. The default matches
        the ``Views`` column of the training frame in this notebook.

    Returns
    -------
    numpy.ndarray
        ``X`` with one extra column equal to
        ``X[:, answer_index] / X[:, views_index]`` appended on the right.
    """
    X = np.asarray(X)
    # Element-wise ratio; zero views are not special-cased here.
    answers_per_view = X[:, answer_index] / X[:, views_index]
    return np.c_[X, answers_per_view]


In [18]:
# Feed the raw training matrix to the ratio helper; the result is the same
# data with one extra column on the right.
extra_cols = transform(traindata.to_numpy())

In [19]:
# Rebuild a labelled frame from the augmented array, reusing the training
# frame's column names and index and naming the appended ratio column.
# The array comes from a frame that mixes the string Tag column with numeric
# columns, so it is object-dtype and every column of the new frame would be
# stored as object; infer_objects() restores numeric dtypes where it can.
training_extra = pd.DataFrame(
    extra_cols,
    columns=list(traindata.columns)+["answers per view"],
    index=traindata.index).infer_objects()
training_extra.head()

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes,answers per view
0,52664,a,3942.0,2.0,155623,7855.0,42.0,0.000255
1,327662,a,26046.0,12.0,21781,55801.0,1175.0,0.000215
2,468453,c,1358.0,4.0,56177,8067.0,60.0,0.000496
3,96996,a,264.0,3.0,168793,27064.0,9.0,0.000111
4,131465,c,4271.0,4.0,112223,13986.0,83.0,0.000286
