Hello! This is some sample code for our 588 project! I begin by looking at our 'clean' data, and then proceed with a toy example. I think that somehow, the way missing values were input is causing serious problems.

In [20]:
#Get the package we need
import pandas as pd

In [21]:
#Upload Data
data = pd.read_csv('C:/Users/Christopher Cook/Downloads/kaggle/movie_metadata.csv')
data = data.dropna(how='any')
data = data.drop(['movie_title','plot_keywords','movie_imdb_link','actor_3_name','actor_2_name','language','genres'],axis=1)
data = pd.get_dummies(data)

In [22]:
data.shape

(3756, 3162)

In [23]:
#Make and standardize Y variable
Y = data['imdb_score']
Y = (Y - Y.mean()) / Y.std()

In [24]:
#Make the X matrix. We need to exclude variables that aren't numbers and the Y variable
X = pd.DataFrame()
for col in list(data):
    if (data[col].dtype == 'float64' or data[col].dtype == 'int64' or data[col].dtype == 'uint8') and (col != 'imdb_score'):
        X[col] = (data[col] - data[col].mean()) / data[col].std()

In [25]:
#Import the Machine learning package and split the data into train and test
from sklearn import linear_model
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,Y)

In [26]:
#declare the model
lasso = linear_model.Lasso(alpha=0.005)

In [27]:
#run the model
lasso.fit(X_train, y_train)

Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [28]:
#Score the model
lasso.score(X_test, y_test)

0.37866226764384375

In [29]:
lasso.coef_

array([ 0.15835464,  0.15014814,  0.01769342, ...,  0.05508892,
        0.        ,  0.02315145])

In [30]:
#for completeness I compare it to a linear regression
linear = linear_model.LinearRegression()
linear.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [31]:
#Score linear regression. Note that without lasso biasing the result down we now get an R^2 of 1.
linear.score(X_test,y_test)

-5.9400695913127178e+28

In [40]:
lasso.coef_

array([ 0.15835464,  0.15014814,  0.01769342, ...,  0.05508892,
        0.        ,  0.02315145])

In [44]:
#get unpenalized coeficients:
X_newtrain = pd.DataFrame()
X_newtest = pd.DataFrame()
for var in range(0, len(list(X))):
    if lasso.coef_[var] > 0.01 or lasso.coef_[var] <  -0.01:
        
        X_newtrain[list(X)[var]] = X_train[list(X)[var]]
        X_newtest[list(X)[var]] = X_test[list(X)[var]]
print(len(list(X_newtrain)))
linear2 = linear_model.LinearRegression()
linear2.fit(X_newtrain,y_train)
linear2.score(X_newtest,y_test)

576


0.31303311292858527

I now construct a toy example because the data didn't really work. We mimic what we did in class, where there are 100 X variables but only the first three matter.

In [None]:
#create the data
import numpy as np
X1 = pd.DataFrame(np.random.uniform(low=0,high=1,size=(1000,100)))
eps = pd.Series(np.random.uniform(low=0,high=1,size=1000))
Y1 = 1/4 * (X1[0] + X1[1] + X1[2] + eps)
X1_train, X1_test, Y1_train, Y1_test = model_selection.train_test_split(X1,Y1)

In [76]:
#create and fit regressions
lasso1 = linear_model.Lasso(alpha=0.002)
linear1 = linear_model.LinearRegression()
lasso1.fit(X1_train,Y1_train)
linear1.fit(X1_train,Y1_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [77]:
#score lasso
lasso1.score(X1_test,Y1_test)

0.73022701107144683

In [78]:
#score OLS
linear1.score(X1_test,Y1_test)

0.6692864306058337

As you can see the lasso data did much better on the testing set because it was able to force many of the unimportant coefficients to zero. In fact it looks like only 1 variable outside the true parameters ended up with a non-zero coefficient.

In [79]:
#see lasso params
lasso1.coef_

array([ 0.22657329,  0.23259456,  0.22644108,  0.        ,  0.        ,
       -0.        , -0.        ,  0.        ,  0.        ,  0.        ,
       -0.        , -0.        , -0.        , -0.        ,  0.        ,
       -0.00247162,  0.        , -0.        , -0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        , -0.        ,  0.        ,  0.        ,
       -0.        ,  0.        , -0.        ,  0.        , -0.        ,
       -0.        ,  0.        , -0.        , -0.        ,  0.        ,
        0.        ,  0.        , -0.        ,  0.        ,  0.        ,
       -0.        ,  0.        ,  0.        ,  0.        , -0.        ,
       -0.        , -0.        , -0.        ,  0.        , -0.        ,
        0.        , -0.        , -0.        , -0.        ,  0.        ,
       -0.        ,  0.        ,  0.        , -0.        ,  0.        ,
       -0.        ,  0.        , -0.        , -0.        ,  0.  

In [33]:
print(lasso.coef_[0])

0.158354644303
