<h1>REGRESSION ANALYSIS</h1>

<h2>Import libraries</h2>

In [1]:
import pandas as pd
import numpy as np
import unicodedata
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

<h2>Load data</h2>

In [2]:
df = pd.read_csv('siivottu_leffadata.csv')
df[['Katsojat', 'Tulot']] = df[['Katsojat', 'Tulot']].astype(float)
df['Ensi'] = pd.to_datetime(df['Ensi'], format='%Y-%m-%d')
df['Pvm'] = pd.to_datetime(df['Pvm'], format='%Y-%m-%d')
df['Genre'] = df['Genre'].apply(eval)
df['SeasonInt'] = df['Season'].astype('category').cat.codes+1
df = df.drop(columns = ['Suom', 'Nimi', 'Ensi', 'Tuotantomaa', 'IMDb ID ja linkki', 'Pvm'])
df


Unnamed: 0,Katsojat,Tulot,Genre,Season,SeasonInt
0,1114954.0,7728955.71,"[Drama, Romance]",,0
1,1023514.0,14190080.11,"[Drama, War]",HL,4
2,821805.0,52911.18,[Drama],,0
3,759301.0,12076.74,"[Biography, Crime, Drama]",,0
4,750965.0,,[Comedy],,0
...,...,...,...,...,...
11752,3.0,,[Thriller],,0
11753,3.0,,"[Drama, War]",,0
11754,3.0,,"[Crime, Drama, Thriller]",,0
11755,3.0,,[Drama],,0


First, let's manipulate the 'Genre' column into boolean columns and categorize encode the 'Season' column.

In [3]:
genres = ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Thriller', 'War','Western']

bool_dict = {}
for i in genres:
    bool_dict[i] = df['Genre'].apply(lambda x: 1 if i in x else 0)
bool_df = pd.DataFrame(bool_dict)
df = pd.merge(df, bool_df, left_index = True, right_index=True, suffixes=('',''))
df = df.drop(columns=['Genre'])
df

Unnamed: 0,Katsojat,Tulot,Season,SeasonInt,Action,Adult,Adventure,Animation,Biography,Comedy,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,1114954.0,7728955.71,,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1023514.0,14190080.11,HL,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,821805.0,52911.18,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,759301.0,12076.74,,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,750965.0,,,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11752,3.0,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
11753,3.0,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
11754,3.0,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
11755,3.0,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Split data into train and test sets and how choice of genres affects profits

In [4]:
df1 = df.dropna(subset=['Tulot'])
df1 = df1.drop(columns=['Season', 'SeasonInt', 'Katsojat'])
X_train, X_test, y_train, y_test = train_test_split(df1.drop(columns=['Tulot']), df1['Tulot'], test_size=0.20)

# Lasso with 5 fold cross-validation
model = LassoCV(cv=5, random_state=0, max_iter=10000)

# Fit model
model.fit(X_train, y_train)
LassoCV(cv=5, max_iter=10000, random_state=0)

# Set best alpha
lasso_best = Lasso(alpha=model.alpha_)
lasso_best.fit(X_train, y_train)

sorted(list(zip(lasso_best.coef_, df1)), key=lambda tup: tup[0], reverse=True)

[(245166.00442020863, 'Adult'),
 (108301.11381841158, 'Music'),
 (85814.30559199478, 'Tulot'),
 (71897.78040408113, 'Animation'),
 (52700.986036188784, 'Family'),
 (49651.22610989928, 'Thriller'),
 (45136.657011247975, 'Adventure'),
 (30497.760236714737, 'Romance'),
 (25915.87131560791, 'Musical'),
 (24803.99581141503, 'Drama'),
 (11835.719751376748, 'Horror'),
 (4037.326029801796, 'Sport'),
 (-0.0, 'Biography'),
 (0.0, 'Mystery'),
 (-28885.558244416512, 'News'),
 (-36157.89762003837, 'Short'),
 (-42583.15261029809, 'Film-Noir'),
 (-47324.87334210973, 'Comedy'),
 (-98180.85024223589, 'Documentary'),
 (-126119.3045073779, 'War'),
 (-132629.77487841624, 'History'),
 (-137367.81395679482, 'Fantasy'),
 (-188995.64105379497, 'Sci-Fi'),
 (-210110.8215373854, 'Crime'),
 (-223102.7968658575, 'Action')]

Split data into train and test sets and see how choice of genres affect viewership

In [5]:
df2 = df.dropna(subset=['Katsojat'])
df2 = df2.drop(columns=['Season', 'SeasonInt', 'Tulot'])
X_train2, X_test2, y_train2, y_test2 = train_test_split(df2.drop(columns=['Katsojat']), df2['Katsojat'], test_size=0.20)

# Fit model
model.fit(X_train2, y_train2)

lasso_best2 = Lasso(alpha=model.alpha_)
lasso_best2.fit(X_train2, y_train2)

sorted(list(zip(lasso_best2.coef_, df2)), key=lambda tup: tup[0], reverse=True)

[(21903.870299289545, 'Adult'),
 (12439.775960551673, 'Katsojat'),
 (9781.922090520811, 'Music'),
 (9311.439274779039, 'Animation'),
 (8932.234657743165, 'Adventure'),
 (6694.801781353999, 'Drama'),
 (5267.613475010911, 'Family'),
 (3422.2302213508406, 'Sport'),
 (3101.877905677394, 'Biography'),
 (1882.1513413828316, 'Thriller'),
 (1540.6724859024039, 'Musical'),
 (1245.4568941997325, 'Horror'),
 (-0.0, 'Mystery'),
 (-287.72970194597696, 'Romance'),
 (-1331.8275562215497, 'News'),
 (-2092.2299223307164, 'Short'),
 (-3608.490032682924, 'Film-Noir'),
 (-5502.114450960331, 'Comedy'),
 (-7789.5015366933385, 'Documentary'),
 (-11741.24941710849, 'Fantasy'),
 (-12743.874712322568, 'History'),
 (-13436.646288736027, 'War'),
 (-15665.752870237233, 'Action'),
 (-22103.04655468799, 'Crime'),
 (-26056.79193139961, 'Sci-Fi')]

First, let's manipulate the 'Season' column into boolean columns.

In [14]:
dfs = df.dropna(subset=['Season'])
dfs = dfs[['Katsojat', 'Tulot', 'Season']]
seasons = ['NY', 'VD', 'MD', 'MS', 'BS', 'HL', 'BF', 'ID', 'CH']


bool_dict2 = {}
for i in seasons:
    bool_dict2[i] = dfs['Season'].apply(lambda x: 1 if i == x else 0)
bool_df2 = pd.DataFrame(bool_dict2)
dfs = pd.merge(dfs, bool_df2, left_index = True, right_index=True, suffixes=('',''))
dfs = dfs.drop(columns=['Season'])
dfs


Unnamed: 0,Katsojat,Tulot,NY,VD,MD,MS,BS,HL,BF,ID,CH
1,1023514.0,14190080.11,0,0,0,0,0,1,0,0,0
5,716495.0,2281870.69,0,0,0,0,0,0,0,0,1
6,715297.0,252773.46,0,0,0,0,0,0,0,0,1
7,711909.0,557.54,0,0,0,0,0,1,0,0,0
8,696682.0,8328641.17,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
11737,9.0,,0,0,0,0,0,1,0,0,0
11738,8.0,,0,0,1,0,0,0,0,0,0
11741,7.0,,1,0,0,0,0,0,0,0,0
11748,6.0,44.00,1,0,0,0,0,0,0,0,0


Split data into train and test sets and see how time of season affects viewership

In [16]:
df3 = dfs.dropna(subset=['Katsojat'])
df3 = df3.drop(columns=['Tulot'])
X_train3, X_test3, y_train3, y_test3 = train_test_split(df3.drop(columns=['Katsojat']), df3['Katsojat'], test_size=0.20)

# Fit model
model.fit(X_train3, y_train3)

lasso_best3 = Lasso(alpha=model.alpha_)
lasso_best3.fit(X_train3, y_train3)

sorted(list(zip(lasso_best3.coef_, df3)), key=lambda tup: tup[0], reverse=True)

[(31961.916622551464, 'ID'),
 (6455.024664351589, 'NY'),
 (4462.734969270959, 'BS'),
 (0.0, 'MS'),
 (-0.0, 'HL'),
 (-0.0, 'BF'),
 (-1465.4688779072783, 'Katsojat'),
 (-3740.3949034311813, 'MD'),
 (-10126.456696036385, 'VD')]

Split data into train and test sets and see how time of season affects profits

In [18]:
df4 = dfs.dropna(subset=['Tulot'])
df4 = df4.drop(columns=['Katsojat'])
X_train4, X_test4, y_train4, y_test4 = train_test_split(df4.drop(columns=['Tulot']), df4['Tulot'], test_size=0.20)

# Fit model
model.fit(X_train4, y_train4)

lasso_best4 = Lasso(alpha=model.alpha_)
lasso_best4.fit(X_train4, y_train4)

sorted(list(zip(lasso_best4.coef_, df4)), key=lambda tup: tup[0], reverse=True)

[(80815.09137439777, 'ID'),
 (68147.5887660205, 'NY'),
 (8595.38568837232, 'MS'),
 (-0.0, 'Tulot'),
 (-0.0, 'MD'),
 (-0.0, 'BS'),
 (-0.0, 'HL'),
 (0.0, 'BF'),
 (-87762.42109859937, 'VD')]