# Read data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Both CSV files are decoded as latin1. `data` is what the models below are
# fitted and scored on; `test` is only ever passed to predict().
data, test = [pd.read_csv(csv_name, encoding='latin1') for csv_name in ('data.csv', 'test.csv')]


In [2]:
# Keep the raw column index in `columns` and show it as the cell output.
columns = data.columns
columns

Index(['Unnamed: 0', 'Episode', 'Station', 'Channel Type', 'Season', 'Year',
       'Date', 'Day of week', 'Start_time', 'End_time', 'Length',
       'Name of show', 'Name of episode', 'Genre', 'First time or rerun',
       '# of episode in the season', 'Movie?',
       'Game of the Canadiens during episode?', 'Market Share_total',
       'Temperature in Montreal during episode'],
      dtype='object')

# Delete some features

In [3]:
# Columns removed from both frames before modelling. The list is written once
# so the modelling frame and the prediction frame always lose the same columns.
DROPPED_COLUMNS = [
    'Unnamed: 0', 'Episode', 'Year', 'Date',
    'Temperature in Montreal during episode', 'Name of show',
    'Name of episode', 'Start_time', 'End_time',
]
# drop() returns a new frame, so the names are rebound instead of using
# inplace=True. Running this cell a second time still raises KeyError, because
# the columns are already gone.
data = data.drop(columns=DROPPED_COLUMNS)
test = test.drop(columns=DROPPED_COLUMNS)


In [4]:
# Refresh `columns` and report what is left; the count includes the target
# column 'Market Share_total', not only the predictors.
columns = data.columns
print("#of Features: {} \nFeatures: \n{}".format(columns.size, columns))

#of Features: 11 
Features: 
Index(['Station', 'Channel Type', 'Season', 'Day of week', 'Length', 'Genre',
       'First time or rerun', '# of episode in the season', 'Movie?',
       'Game of the Canadiens during episode?', 'Market Share_total'],
      dtype='object')


# Convert categorical features to numerical codes

In [25]:
def make_keys(lsts):
    """Assign consecutive integer codes, starting at 1, to the items of ``lsts``.

    Args:
        lsts: iterable of hashable labels (in this notebook, the unique values
            of a column).

    Returns:
        dict mapping each label to its 1-based position in ``lsts``. When a
        label occurs more than once, its last position wins.
    """
    codes = {}
    for position, label in enumerate(lsts, start=1):
        codes[label] = position
    return codes

In [26]:
# Hand-written lookups for the two-valued columns.
binary_feature = dict(Yes=1, No=0)
channel_type_feature = {'Specialty Channel': 0, 'General Channel': 1}
# Derived lookups: codes start at 1 and follow the order in which unique()
# returns the values of `data`. Only `data` is consulted, so a value that
# occurs solely in `test` gets no code.
station_feature = make_keys(data['Station'].unique())
genre_feature = make_keys(data['Genre'].unique())
season_feature = make_keys(data['Season'].unique())
day_of_week_feature = make_keys(data['Day of week'].unique())

In [27]:
# Column name -> lookup table, listed once so that `data` and `test` are
# always encoded with exactly the same mappings.
CATEGORICAL_ENCODERS = {
    'Movie?': binary_feature,
    'Channel Type': channel_type_feature,
    'Season': season_feature,
    'Day of week': day_of_week_feature,
    'First time or rerun': binary_feature,
    '# of episode in the season': binary_feature,
    'Game of the Canadiens during episode?': binary_feature,
    'Station': station_feature,
    'Genre': genre_feature,
}


def encode_categoricals(frame, encoders):
    """Replace each listed column of ``frame`` with its numeric code, in place.

    Args:
        frame: DataFrame containing every column named in ``encoders``.
        encoders: dict of column name -> {raw value: numeric code}.

    Returns:
        None. ``frame`` is modified directly. Series.map yields NaN for any
        value missing from its lookup, so unseen categories become NaN.
    """
    for column, lookup in encoders.items():
        frame[column] = frame[column].map(lookup)


# NOTE: run once per kernel session. A second run would look the already
# encoded values up as if they were raw labels.
encode_categoricals(data, CATEGORICAL_ENCODERS)
#
encode_categoricals(test, CATEGORICAL_ENCODERS)

In [28]:
# Preview the first three rows after encoding.
data.head(3)

Unnamed: 0,Station,Channel Type,Season,Day of week,Length,Genre,First time or rerun,# of episode in the season,Movie?,Game of the Canadiens during episode?,Market Share_total
0,1,1,1,1,8,1,0,1,0,0,0.9
1,1,1,1,1,2,2,0,1,0,0,0.5
2,1,1,1,1,2,3,0,1,0,0,0.3


# Train/test split 

In [29]:
# Target: the market-share column. Features: every other column.
# The features are selected by dropping the target by name rather than by
# taking "all but the last column", so a change in column order cannot put the
# target into the feature matrix.
y = data["Market Share_total"]
x = data.drop(columns=["Market Share_total"])

In [30]:
# Hold out 30% of the rows for scoring; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [31]:
# Row counts of the two partitions.
print("Train Size: ", x_train.shape[0])
print("Test Size: ", x_test.shape[0])

Train Size:  431659
Test Size:  184997


# DecisionTreeRegressor

In [42]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_squared_error
# Regression tree with depth capped at 100. No random_state is passed.
# NOTE(review): scikit-learn may break ties between equally good splits
# differently from run to run — pin random_state if exact scores must repeat.
estimator = DecisionTreeRegressor(max_depth = 100)

In [43]:
# Fit on the training partition; as the last expression of the cell, the
# fitted estimator's repr becomes the cell output.
estimator.fit(x_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=100, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [44]:
# Predictions for the held-out rows; scored in the following cells.
predict = estimator.predict(x_test)

***R2***

In [45]:
# Coefficient of determination of the tree on the held-out rows.
r2_score(y_test, predict)

0.7514762704520279

***MSE***

In [46]:
# Mean squared error of the tree on the held-out rows (squared target units).
mean_squared_error(y_test, predict)

5.952461865668787

In [47]:
# Predict the frame loaded from test.csv with the fitted tree. This requires
# `test` to hold exactly the feature columns the tree was fitted on.
# `test_prediction` is not read again in the cells visible here.
test_prediction=estimator.predict(test)

# LinearRegression

In [48]:
from sklearn.linear_model import LinearRegression 
# Rebinds `estimator`: from here on the name refers to the linear model and the
# fitted tree is no longer reachable through it.
estimator = LinearRegression()

In [49]:
# Ordinary least squares on the same training partition the tree used.
estimator.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [50]:
# Overwrites `predict` with the linear model's output for the held-out rows.
predict = estimator.predict(x_test)

In [51]:
# Coefficient of determination of the linear model on the held-out rows.
r2_score(y_test, predict)

0.33714869338647446

In [52]:
# Mean squared error of the linear model on the held-out rows.
mean_squared_error(y_test, predict)

15.87613839693376

# Deep learning

In [54]:

from sklearn import preprocessing
# Scale every feature to [0, 1] using the min/max learned from the modelling data.
# NOTE(review): the scaler is fitted on all of `x` before the split in the next
# cell, so the validation rows also influence the learned min/max.
min_max_scaler = preprocessing.MinMaxScaler(feature_range = (0, 1))
x = min_max_scaler.fit_transform(x)
# Reuse the already-fitted scaler for `test`. fit_transform() here would
# re-learn min/max from `test` itself, so the same raw value would be scaled
# differently in the two frames and the network would be fed inputs on a
# different scale than it was trained on.
# Both `x` and `test` are overwritten with arrays: run this cell once per session.
test = min_max_scaler.transform(test)

***Train test split***

In [55]:
from sklearn.model_selection import train_test_split
# New split for the network: 5% hold-out with seed 101. This is a different
# hold-out set than the 30% / seed 42 split scored for the tree and the linear
# model, so the metrics are not computed on the same rows.
x_train, x_test, y_train, y_test = (
    np.array(part)
    for part in train_test_split(x, y, test_size=0.05, random_state=101)
)
# Array copy of the scaled test.csv features, used for the final prediction.
TEST_END = np.array(test)

***fully connected***

In [57]:
from keras.models import Sequential
from keras.layers import Dense, Activation
import tensorflow as tf

# Fully connected regressor: three ReLU layers narrowing 1024 -> 512 -> 256,
# followed by a single linear output unit.
model = Sequential([
    Dense(1024, activation='relu'),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(1, activation='linear'),
])

# Trained on mean absolute error, while the scores reported later are R2 and MSE.
model.compile(loss='mae', optimizer='adam')
# The 5% hold-out is passed as validation data and is also the set the final
# metrics are computed on. fit() is the last expression, so its History object
# is the cell output.
model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    batch_size=10000,
    validation_data=(x_test, y_test),
)


Train on 585823 samples, validate on 30833 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f623f8c66d8>

***Prediction***

In [17]:
# Network predictions for the hold-out rows (y_pred) and for the scaled
# test.csv features (y_TEST). y_TEST is not read again in the cells visible here.
# NOTE(review): the saved execution count of this cell (17) is lower than that
# of the training cell (57), so the saved metrics below may come from a
# prediction made before the model was last trained. Restart and run all to confirm.
y_pred = model.predict(x_test)
y_TEST=model.predict(TEST_END)

In [58]:
from sklearn.metrics import r2_score
# Score the network on its 5% hold-out. mean_squared_error is the name imported
# in the DecisionTreeRegressor section.
print("R2:",r2_score(y_test, y_pred))
print("MSE:",mean_squared_error(y_test, y_pred))

R2: 0.6322306203693159
MSE: 8.720691227444023
