In [46]:
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

In [47]:
# Path to the cleaned movies dataset, relative to this notebook.
basics_path = "../data/movies_clean.csv"

# low_memory=False has pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns.
data = pd.read_csv(filepath_or_buffer=basics_path, low_memory=False)

In [48]:
# Remove the `tconst` column before modelling.
# errors="ignore" keeps this cell idempotent: because `data` is reassigned
# from itself, a second run would otherwise raise KeyError (the column is
# already gone after the first run).
data = data.drop(columns="tconst", errors="ignore")

In [49]:
# Preview the first five rows to sanity-check the remaining columns.
data.head(n=5)

Unnamed: 0,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes
0,Miss Jerry,Miss Jerry,0,1894.0,45,Romance,5.2,232
1,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897.0,100,"Documentary,News,Sport",5.3,584
2,Soldiers of the Cross,Soldiers of the Cross,0,1900.0,40,"Biography,Drama",5.4,67
3,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,70,"Action,Adventure,Biography",6.0,1046
4,The Prodigal Son,L'enfant prodigue,0,1907.0,90,Drama,4.8,37


In [50]:
# Features and target: every remaining column except the rating is a
# candidate predictor; `averageRating` is what the model learns to predict.
X = data.drop(columns="averageRating")
y = data["averageRating"]

# Train/test split: hold out 30% of the rows for evaluation.
# The split is seeded so that Restart & Run All reproduces the same
# partition, and therefore the same predictions and score.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE
)

# Display the shapes as a quick check that rows were partitioned as expected.
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((209031, 7), (89585, 7), (209031,), (89585,))

In [51]:

# Only columns with one of these dtypes are fed to the model.
numeric_dtypes = ["int64", "float64", "bool"]
numeric_features = X_train.select_dtypes(include=numeric_dtypes).columns

# Fill missing values with the column median, which is robust to outliers.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

# Apply the numeric transformer to the selected columns; every other column
# (remainder="drop") is discarded before the regressor sees the data.
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_features)],
    remainder="drop",
)

# End-to-end estimator: preprocessing followed by a linear regression.
model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

In [54]:
# Fit the pipeline on the training split and print its predictions for the
# held-out rows. `fit` returns the fitted pipeline, so the calls are chained.
print(model.fit(X_train, y_train).predict(X_test))

# Print the pipeline's score (R^2 for a regressor) on the held-out split.
print(model.score(X_test, y_test))

[6.19466259 6.01583619 6.21161755 ... 6.19708293 6.19519944 6.04881985]
0.008757927903577878


In [55]:
# File the fitted pipeline is serialized to (relative to the notebook).
PIPELINE_PATH = "pipeline_linear_regression.pkl"

# Export the pipeline as a pickle file.
with open(PIPELINE_PATH, "wb") as file:
    pickle.dump(model, file)

# Load the pipeline back from the pickle file. The handle is opened in a
# `with` block so it is closed deterministically instead of being leaked.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# you created yourself or otherwise trust.
with open(PIPELINE_PATH, "rb") as file:
    my_pipeline = pickle.load(file)
