## Alexandria Glover - Project 3: Upvotes Dataset

In [None]:
# Core data-handling libraries used throughout the notebook.
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. from scikit-learn) that
# may flag real problems — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the project CSV into a DataFrame, then show each column's dtype.
csv_path = 'test_8i3B3FC.csv'
data = pd.read_csv(csv_path)
data.dtypes

ID              int64
Tag            object
Reputation    float64
Answers       float64
Username        int64
Views         float64
dtype: object

In [None]:
# Clean the data / deal with missing values: count the missing entries in every
# column. All-zero counts mean the data is already clean.
data.isna().sum()

ID            0
Tag           0
Reputation    0
Answers       0
Username      0
Views         0
dtype: int64

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views
0,366953,a,5645.0,3.0,50652,33200.0
1,71864,c,24511.0,6.0,37685,2730.0
2,141692,i,927.0,1.0,135293,21167.0
3,316833,i,21.0,6.0,166998,18528.0
4,440445,i,4475.0,10.0,53504,57240.0


In [None]:
from sklearn.model_selection import train_test_split

# Separate the regression target ('Views') from the predictor columns.
# NOTE(review): 'ID' and 'Username' look like identifiers yet remain in X as
# features — confirm that is intended.
target_column = 'Views'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [None]:
## Create the single pipeline that does the full process

from sklearn.preprocessing import StandardScaler, OrdinalEncoder,OneHotEncoder,MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Use custom transformer pipelines per dtype; the categorical branch is
# assigned an ordinal encoder.

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with a constant placeholder, then map each
# category to an integer code.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OrdinalEncoder()),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Partition the predictor columns by dtype so each group can be routed to its
# own transformer.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object']
numeric_features = X.select_dtypes(include=numeric_dtypes).columns
categorical_features = X.select_dtypes(include=categorical_dtypes).columns

## Sklearn Preprocessor

In [None]:
# Route each column group through its matching transformer.
column_routes = [
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

## Estimator & Decision Tree Pipeline

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Chain preprocessing and a decision-tree regressor into a single estimator.
# The tree is seeded (same seed as the train/test split): scikit-learn permutes
# the candidate features at every split, so an unseeded tree can produce
# different scores from one run to the next.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=123)),
])

In [None]:
# Fit preprocessing and the tree on the training split. fit() returns the
# pipeline itself, so dt_model is simply the fitted pipeline.
pipeline.fit(X_train, y_train)
dt_model = pipeline
print(dt_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['ID', 'Reputation', 'Answers', 'Username'], dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('encoder',
                                                                   OrdinalEncoder())]),
                                              

In [None]:
from sklearn.metrics import r2_score

# Score the fitted decision-tree pipeline on the held-out split using R^2.
predictions = dt_model.predict(X_test)
test_r2 = r2_score(y_test, predictions)
print(test_r2)

-0.3679832958651248


## Preprocessing
### SimpleImputer (median) / StandardScaler / OneHotEncoder

In [None]:
# Implement OneHotEncoder: second preprocessing variant with median imputation
# for numeric columns and one-hot encoding for categorical columns.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    # Encode categories unseen during fit as all-zero rows instead of raising
    # at predict time (e.g. a rare tag missing from a training fold).
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Rebuild the ColumnTransformer. The existing `preprocessor` still holds the
# earlier transformer objects (mean imputer + OrdinalEncoder), so rebinding the
# two names above is not enough for the new imputer/encoder to take effect.
# Later cells that reuse `preprocessor` pick up this configuration.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
])

# Seed the tree so repeated runs give the same fitted model and scores.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=123)),
])

In [None]:
# Refit the decision-tree pipeline on the training split. fit() returns the
# pipeline itself, so dt_model is simply the fitted pipeline.
pipeline.fit(X_train, y_train)
dt_model = pipeline
print(dt_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['ID', 'Reputation', 'Answers', 'Username'], dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('encoder',
                                                                   OrdinalEncoder())]),
                                              

In [None]:
# Other sklearn metrics for evaluating model performance: mean squared error
# and mean absolute error of the decision-tree pipeline on the test split.
from sklearn.metrics import r2_score

predictions = dt_model.predict(X_test)
test_mse = mean_squared_error(y_test, predictions)
test_mae = mean_absolute_error(y_test, predictions)
print(test_mse)
print(test_mae)

9503111441.662071
35206.90236832803


## Cross Validation Score

In [None]:
 from sklearn.model_selection import cross_val_score

In [None]:
# 3-fold cross-validation of the decision-tree regression pipeline on the
# training split.
# 'Views' is a continuous regression target, so plain KFold is used:
# StratifiedKFold stratifies on class labels and is meant for classification.
from sklearn.model_selection import KFold

cv = KFold(n_splits=3, shuffle=True, random_state=123)
# cross_val_score refits a clone of the pipeline on each fold; R^2 is requested
# explicitly rather than relying on the estimator's default scorer.
score = cross_val_score(dt_model, X_train, y_train, cv=cv, scoring='r2')
print("Mean validation score: {0:.3f} (std: {1:.5f})".format(np.mean(score),np.std(score)))

Mean validation score: -0.369 (std: 0.08453)


In [None]:
# Labelled test-split error metrics for the decision-tree pipeline.
predictions = dt_model.predict(X_test)
dt_mse = mean_squared_error(y_test, predictions)
dt_mae = mean_absolute_error(y_test, predictions)
print("Mean Square Error(DecisionTreeRegressor): ", dt_mse)
print("Mean Absolute Error(DecisionTreeRegressor):", dt_mae)


Mean Square Error(DecisionTreeRegressor):  9503111441.662071
Mean Absolute Error(DecisionTreeRegressor): 35206.90236832803


In [None]:
##Linear Regression Pipeline
from sklearn.linear_model import LinearRegression

In [None]:
# Same preprocessing as before, now feeding an ordinary least-squares regressor.
lr_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=lr_steps)

In [None]:
# Fit the linear-regression pipeline on the training split. fit() returns the
# pipeline itself, so lr_model is simply the fitted pipeline.
pipeline.fit(X_train, y_train)
lr_model = pipeline
print(lr_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['ID', 'Reputation', 'Answers', 'Username'], dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('encoder',
                                                                   OrdinalEncoder())]),
                                              

In [None]:
# 3-fold cross-validation of the linear-regression pipeline on the training
# split.
# 'Views' is a continuous regression target, so plain KFold is used:
# StratifiedKFold stratifies on class labels and is meant for classification.
from sklearn.model_selection import KFold

cv = KFold(n_splits=3, shuffle=True, random_state=123)
# cross_val_score refits a clone of the pipeline on each fold; R^2 is requested
# explicitly rather than relying on the estimator's default scorer.
score = cross_val_score(lr_model, X_train, y_train, cv=cv, scoring='r2')
print("Mean validation score: {0:.3f} (std: {1:.5f})".format(np.mean(score),np.std(score)))

Mean validation score: 0.266 (std: 0.00973)


In [None]:
# Display the individual per-fold scores from the most recent cross_val_score call.
score

array([0.25841275, 0.2801378 , 0.26077191])

In [None]:
# Labelled test-split error metrics for the linear-regression pipeline.
predictions = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, predictions)
lr_mae = mean_absolute_error(y_test, predictions)
print("Mean Square Error(Linear Regression): ", lr_mse)
print("Mean Absolute Error(Linear Regression):", lr_mae)

Mean Square Error(Linear Regression):  5250686727.967266
Mean Absolute Error(Linear Regression): 28484.495992330267
