In [1]:
#Load the Data

import pandas as pd

data = pd.read_csv('data.csv',encoding='latin1')

In [10]:
#Function for converting Categorical Features to Numerical

def convertCategoricalToNumerical(inputData, NameOfColumn, column):
    uniq_itemOfColumn = set(list(inputData[NameOfColumn]))
    label_index = dict((c, i) for i, c in enumerate(uniq_itemOfColumn))
    #print(len(label_index))
    Y = []
    for i in column:
        Y.append(label_index[i])
        
    return Y

In [11]:
#Preprocess Data: Choose some Features and convert them to numerical Features

columns = ['Episode', 'Station', 'Channel Type', 'Season', 'Year','Day of week',
           'Name of show','Genre','First time or rerun','# of episode in the season', 
           'Movie?','Game of the Canadiens during episode?']

convertedX = pd.DataFrame(columns=columns)

for c in columns:
    convertedX[c] = convertCategoricalToNumerical(data, c, data[c])

convertedX['Length'] = data['Length']
convertedX['MarketShare_total'] = data['MarketShare_total']

# Delete NAN records
convertedX = convertedX.dropna()

In [4]:
#Make Train and Test from the Data

import numpy as np

msk = np.random.rand(len(convertedX)) < 0.8
train = convertedX[msk]
test = convertedX[~msk]

train_y = train.MarketShare_total
train = train.drop('MarketShare_total', axis=1)

test_y = test.MarketShare_total
test = test.drop('MarketShare_total', axis=1)

In [5]:
# Linear Regression Model

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
   
regr = linear_model.LinearRegression() 
regr.fit(train, train_y)

predicted = regr.predict(test)

print('MSE:', mean_squared_error(test_y, predicted))
print('MAE:', mean_absolute_error(test_y, predicted))

MSE: 16.310708600670885
MAE: 2.3786746679408073


In [6]:
# Decision Tree Model for Regression

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

regressor = DecisionTreeRegressor(criterion='mse', max_depth=20)
regressor.fit(train, train_y)

regPred = regressor.predict(test)

print('MSE:', mean_squared_error(test_y, regPred))
print('MAE:', mean_absolute_error(test_y, regPred))

MSE: 3.900934414957479
MAE: 1.167668321548421


In [7]:
# Random Forest Model for Regression

from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

regressor = RandomForestRegressor(n_estimators = 100, random_state = 0) 
regressor.fit(train, train_y)  

y_pred = regressor.predict(test)

print('MSE:', mean_squared_error(test_y, y_pred))
print('MAE:', mean_absolute_error(test_y, y_pred))

MSE: 3.5666570755689393
MAE: 1.1209354290126314


In [8]:
# Gradient Boosting Model

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.2,
                                max_depth=5, random_state=0, loss='ls')
est.fit(train, train_y)

Gpred = est.predict(test)

print('MSE:', mean_squared_error(test_y, Gpred))
print('MAE:', mean_absolute_error(test_y, Gpred))

MSE: 4.851736390771469
MAE: 1.355707896097597


In [None]:
# SVM for Regression 

import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
y_rbf = svr_rbf.fit(train, train_y).predict(test)

print('MSE:', mean_squared_error(test_y, y_rbf))
print('MAE:', mean_absolute_error(test_y, y_rbf))

In [13]:
# Load Test Data and Preprocess, then Predict with Random Forest Model

TaskTest = pd.read_csv('test.csv',encoding='latin1')

convertedTest = pd.DataFrame(columns=columns)
for cc in columns:
    convertedTest[cc] = convertCategoricalToNumerical(TaskTest, cc, TaskTest[cc])

convertedTest['Length'] = TaskTest['Length']
convertedTest = convertedTest.dropna()

test_Prediction = regressor.predict(convertedTest)
print(test_Prediction)

[0.97808865 1.60881837 1.24152846 ... 1.9601975  4.80771124 3.86916772]
