# KEN3450 COMPETITION


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

## Read the data
- drop the label column (`shares`) from the training features; the test data is used as-is
- extract the label (`shares`); note that the code below keeps it on its raw scale and does not transform it to log space
- join the training and testing data into a single dataframe for easier transformation with pandas


In [2]:
# load the competition CSV files
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# target: the `shares` column, kept on its raw scale (no transform is applied here)
y_train = train_data['shares']

# features: everything except the target for train; the test frame is used as-is
x_train_raw = train_data.drop(columns=['shares'])
x_test_raw = test_data

# remember the lengths so the combined frame can be split again later
train_length, test_length = len(train_data), len(test_data)

# stack train + test features so both go through the same transformations
all_data = pd.concat([x_train_raw, x_test_raw])

# show every column when displaying wide frames
pd.set_option('display.max_columns', 500)
x_test_raw.head(2)


Unnamed: 0,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,topic_01,topic_02,topic_03,topic_04,topic_05,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,title_subjectivity,title_sentiment_polarity
0,9,531,0.503788,1.0,0.665635,9,0,1,0,4.404896,7,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,1,0,0,0,0,0,0,0,0.028573,0.4193,0.494651,0.028905,0.028572,0.42985,0.100705,0.041431,0.020716,0.666667,0.333333,0.0,0.0
1,10,370,0.559889,1.0,0.698198,2,2,0,0,4.359459,9,0,0,0,0,1,0,0.0,0.0,0.0,8500.0,8500,8500.0,1,0,0,0,0,0,0,0,0.022245,0.306718,0.022231,0.022224,0.626582,0.437409,0.071184,0.02973,0.027027,0.52381,0.47619,0.642857,0.214286


### Transforming features
- missing -> replace with median 
- categorical -> One hot encoding
- numerical -> a `RobustScaler` (robust to outliers) is instantiated, but the scaling step is currently commented out, so numeric features are left unscaled


In [3]:
# split the columns by dtype: object columns are treated as categorical,
# everything else as numeric
numerical_features = x_train_raw.columns[x_train_raw.dtypes != 'object']
categorical_features = x_train_raw.columns[x_train_raw.dtypes == 'object']

# impute missing numeric values with the per-column median.
# The median is computed on the numeric columns only: calling .median() on the
# whole frame raises a TypeError on recent pandas versions as soon as an
# object (categorical) column is present.
numeric_medians = all_data[numerical_features].median()
all_data[numerical_features] = all_data[numerical_features].fillna(numeric_medians)

# encode missing categorical data as a special category -> "Missing"
all_data[categorical_features] = all_data[categorical_features].fillna("Missing")

# scaler for the numeric variables; it is created but NOT applied, i.e. the
# numeric features are passed on unscaled
ss = RobustScaler()

# one-hot encode the categorical variables
all_data = pd.get_dummies(data=all_data, columns=categorical_features)

### Re-split the data back to original proportions


In [4]:
# undo the concatenation: the first `train_length` rows are the training rows,
# the last `test_length` rows are the test rows
x_train, x_test = all_data.head(train_length), all_data.tail(test_length)

print(f"Raw features vs features in transformed model: {x_train_raw.shape[1]} vs {x_train.shape[1]} features")
print(f"Length train vs test: {train_length}  vs  {test_length}")

Raw features vs features in transformed model: 44 vs 44 features
Length train vs test: 29733  vs  9911


## Gradient boosting regressor (scikit-learn `GradientBoostingRegressor`)


In [5]:
# Gradient-boosted trees (scikit-learn's GradientBoostingRegressor).
# The loss is left at its default, which is squared error: the old spelling
# loss='ls' was deprecated and later removed from scikit-learn, so passing it
# explicitly makes the constructor fail on current versions.
Xgbr = GradientBoostingRegressor(n_estimators=5000, learning_rate=0.01, max_depth=7,
                                 max_features='sqrt', min_samples_leaf=15, min_samples_split=10,
                                 random_state=13)

# linear base models, used only as members of the averaging ensemble
lasso = Lasso(alpha=0.0005, random_state=1)
ENet = ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that predicts the unweighted mean of its base models.

    Parameters
    ----------
    models : iterable of estimators
        Base estimators. They are cloned in ``fit``; the clones are what get
        fitted and used for prediction.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on ``(X, y)`` and return self."""
        self.models_ = list(map(clone, self.models))

        for fitted in self.models_:
            fitted.fit(X, y)

        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted clones' predictions for ``X``."""
        # one column per model, one row per sample
        per_model = [fitted.predict(X) for fitted in self.models_]
        return np.column_stack(per_model).mean(axis=1)


# ensemble of the three base models defined in this cell
averaged_models = AveragingModels(models=(Xgbr, lasso, ENet))


In [6]:
# train the gradient boosting model on the full training set;
# scikit-learn's fit() returns the estimator itself, so best_model is Xgbr
Xgbr.fit(x_train, y_train)
best_model = Xgbr

### Write the submission

In [7]:
# predict the target for the test features
y_test = best_model.predict(x_test)

# submission table: a 1-based running id followed by the predicted shares
result = (
    pd.DataFrame({'shares': y_test.ravel()})
    .assign(id=lambda frame: frame.index + 1)
    .loc[:, ['id', 'shares']]
)

result.to_csv('submission.csv', index=False)


In [8]:
# display the submission table as a final sanity check
result

Unnamed: 0,id,shares
0,1,1516.482920
1,2,4113.982565
2,3,1424.861534
3,4,1611.413181
4,5,3357.600822
5,6,2422.796938
6,7,1423.461302
7,8,1650.528421
8,9,1071.823094
9,10,2122.705865


## 