In [55]:
import pandas as pd
import numpy as np
import pickle

In [56]:
from sklearn.pipeline import Pipeline

In [57]:
#pip install pickle5
#pip install pyarrow

In [58]:
with open('test.pickle', "rb") as fh:
    data_val = pickle.load(fh)

In [59]:
data1 = pd.read_parquet('train_numerical_features.parquet')
data2 = pd.read_parquet('train_text_features.parquet')

data = data1.merge(data2, on = ['id', 'tagline', 'credits', 'title'])

In [60]:
data.drop(['poster_path', 'backdrop_path', 'recommendations'], axis=1, inplace=True)

data = data[~ (
    (data['revenue']==0) |
    (data['release_date'].isna()) |
    (data['runtime'].isna()) |
    ((data['status'] != 'Released'))
                )]

data['release_date'] = pd.to_datetime(data['release_date'], format = '%Y-%m-%d')

data_cat = data.select_dtypes(include=[object])
data[data_cat.columns] = data_cat.fillna('')

data.loc[:, 'label'] = 'Very Positive'

data.loc[(data['vote_average'] <= 8) & (data['vote_average'] > 7), 'label'] = 'Positive'
data.loc[(data['vote_average'] <= 7) & (data['vote_average'] > 6), 'label'] = 'Mostly Positive'
data.loc[(data['vote_average'] <= 6) & (data['vote_average'] > 5), 'label'] = 'Mixed'
data.loc[(data['vote_average'] <= 5) , 'label'] = 'Negative'

data.drop(['vote_average', 'id', 'status'], axis=1, inplace=True)
data.rename(columns = {'revenue':'target'},
            inplace=True)

In [61]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import confusion_matrix

In [62]:
col_num = ['budget', 'runtime']

In [63]:
#pip install -U scikit-learn==1.1

In [64]:
values = [x.split(sep='-') for x in data_cat['genres'].unique()]
values = [i  for x in values for i in x]

val_unique, counts = np.unique(values, return_counts=True)
series_count = pd.Series(counts, index = val_unique).sort_values(ascending = False).head(10)
    
genres_cols = series_count.index
global genres_cols

In [123]:
def most_freq(dataframe):
    df = {}
    for col in dataframe.columns:
        if col != 'original_language':
            values = [x.split(sep='-') for x in dataframe[col]]
            values = [i  for x in values for i in x]
        else:
            values = dataframe[col]
        val_unique, counts = np.unique(values, return_counts=True)
        series_count = pd.Series(counts, index = val_unique).sort_values(ascending = False).head(10)
    
        
        for value in series_count.index:
            df[f'{col}_{value}'] = pd.DataFrame(np.where(dataframe[col].str.contains(col), 1, 0))
    
    df_final = pd.concat(df.values(), axis=1)
    df_final.columns = df.keys()
    return df_final

transformer = FunctionTransformer(most_freq)

def date_split(df_original):
    df = df_original.copy()
    for col in df.columns:
        df[f'{col}_day'] = df[col].dt.day
        df[f'{col}_month'] = df[col].dt.month
        df[f'{col}_year'] = df[col].dt.year
    return df.drop(col, axis=1)
    
transformer2 = FunctionTransformer(date_split)  

In [124]:
len(data.original_language.unique())

34

In [125]:
data.head()

Unnamed: 0,title,budget,target,runtime,tagline,credits,genres,original_language,overview,production_companies,release_date,keywords,label,budget/runtime
0,Fantastic Beasts: The Secrets of Dumbledore,200000000.0,400000000.0,142.0,Return to the magic.,Jude Law-Eddie Redmayne-Mads Mikkelsen-Ezra Mi...,Fantasy-Adventure-Action,en,Professor Albus Dumbledore knows the powerful ...,Warner Bros. Pictures-Heyday Films,2022-04-06,magic-curse-fantasy world-wizard-magical creat...,Mostly Positive,1408451.0
1,Sonic the Hedgehog 2,110000000.0,393000000.0,122.0,Welcome to the next level.,James Marsden-Ben Schwartz-Tika Sumpter-Natash...,Action-Adventure-Family-Comedy,en,After settling in Green Hills Sonic is eager t...,Original Film-Blur Studio-Marza Animation Plan...,2022-03-30,sequel-based on video game-hedgehog-live actio...,Positive,901639.3
2,The Lost City,74000000.0,164289828.0,112.0,The adventure is real. The heroes are not.,Sandra Bullock-Channing Tatum-Daniel Radcliffe...,Action-Adventure-Comedy,en,A reclusive romance novelist was sure nothing ...,Paramount-Fortis Films-3dot Productions-Exhibi...,2022-03-24,duringcreditsstinger,Mostly Positive,660714.3
3,Morbius,75000000.0,161000000.0,105.0,A new Marvel legend arrives.,Jared Leto-Matt Smith-Adria Arjona-Jared Harri...,Action-Science Fiction-Fantasy,en,Dangerously ill with a rare blood disorder and...,Columbia Pictures-Avi Arad Productions-Matt To...,2022-03-30,vampire-based on comic,Mostly Positive,714285.7
4,Uncharted,120000000.0,400780000.0,116.0,Fortune favors the bold.,Tom Holland-Mark Wahlberg-Sophia Ali-Tati Gabr...,Action-Adventure,en,A young street-smart Nathan Drake and his wise...,Columbia Pictures-Atlas Entertainment-PlayStat...,2022-02-10,treasure-treasure hunt-based on video game-dlb,Positive,1034483.0


In [126]:
data['budget/runtime'] = data['budget']/data['runtime']

In [127]:
data.head(1)

Unnamed: 0,title,budget,target,runtime,tagline,credits,genres,original_language,overview,production_companies,release_date,keywords,label,budget/runtime
0,Fantastic Beasts: The Secrets of Dumbledore,200000000.0,400000000.0,142.0,Return to the magic.,Jude Law-Eddie Redmayne-Mads Mikkelsen-Ezra Mi...,Fantasy-Adventure-Action,en,Professor Albus Dumbledore knows the powerful ...,Warner Bros. Pictures-Heyday Films,2022-04-06,magic-curse-fantasy world-wizard-magical creat...,Mostly Positive,1408451.0


In [128]:
preprocessing = ColumnTransformer(
    transformers = [
        ('StandardScaler', StandardScaler(), col_num),
        ('GenresEncoder',transformer, ['genres', 'keywords', 'production_companies', 'original_language']),
        ('DateConverter', transformer2, ['release_date'])
    ])

In [97]:
preprocessing = ColumnTransformer(
    transformers = [
        ('StandardScaler', StandardScaler(), col_num)
    ])

In [129]:
pipe1 = Pipeline(steps = [
    ('prepro', preprocessing),
    ('model', DecisionTreeClassifier())
])

Clasificación

In [130]:
X_train, X_test, y_train, y_test = train_test_split(
    data[['original_language', 'budget', 'runtime', 'genres',
          'keywords', 'production_companies', 'release_date', 'budget/runtime']],
    data['label'],
    shuffle = True,
    test_size = 0.2,
    random_state = 0
)

In [131]:
pipelines = {'DecisionTree': pipe1}

for name, model in pipelines.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    scores = classification_report(y_test, y_pred)
    print(f'Para el modelo {name} el performance es el siguiente:\n',scores,'\n\n')

Para el modelo DecisionTree el performance es el siguiente:
                  precision    recall  f1-score   support

          Mixed       0.30      0.31      0.30       272
Mostly Positive       0.53      0.51      0.52       611
       Negative       0.07      0.06      0.07        31
       Positive       0.41      0.41      0.41       346
  Very Positive       0.10      0.13      0.11        31

       accuracy                           0.42      1291
      macro avg       0.28      0.28      0.28      1291
   weighted avg       0.43      0.42      0.42      1291
 




In [132]:
y_pred

array(['Positive', 'Mostly Positive', 'Positive', ..., 'Positive',
       'Positive', 'Mostly Positive'], dtype=object)

In [133]:
cats = ['Very Positive','Positive','Mostly Positive','Mixed','Negative']

pd.DataFrame(confusion_matrix(y_test,
                              y_pred,
                              labels = cats),
            columns = cats,
            index = cats)

Unnamed: 0,Very Positive,Positive,Mostly Positive,Mixed,Negative
Very Positive,4,11,14,2,0
Positive,22,141,129,53,1
Mostly Positive,14,135,313,134,15
Mixed,2,50,125,84,11
Negative,0,5,15,9,2


In [134]:
from sklearn.model_selection import GridSearchCV

In [135]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [136]:
pipe_grid = Pipeline(steps = [
    ('prepro', preprocessing),
    ('modelo', DecisionTreeClassifier())
])

In [137]:
params = [
    {
        'modelo': [LogisticRegression(random_state = 0)],
        'modelo__penalty': ['l1', 'l2'],
        'modelo__solver': ['liblinear']
    },{
        'modelo': [RandomForestClassifier(random_state=0)],
        'modelo__max_depth': [2,4,6,8,10]
    }]

In [138]:
grid_model = GridSearchCV(pipe_grid,
                           param_grid=params,
                           verbose=10,
                            scoring = 'f1_macro',
                         cv=5)
grid_model.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5; 1/7] START modelo=LogisticRegression(random_state=0), modelo__penalty=l1, modelo__solver=liblinear




[CV 1/5; 1/7] END modelo=LogisticRegression(random_state=0), modelo__penalty=l1, modelo__solver=liblinear;, score=0.186 total time=   0.6s
[CV 2/5; 1/7] START modelo=LogisticRegression(random_state=0), modelo__penalty=l1, modelo__solver=liblinear
[CV 2/5; 1/7] END modelo=LogisticRegression(random_state=0), modelo__penalty=l1, modelo__solver=liblinear;, score=0.193 total time=   0.3s
[CV 3/5; 1/7] START modelo=LogisticRegression(random_state=0), modelo__penalty=l1, modelo__solver=liblinear




[CV 3/5; 1/7] END modelo=LogisticRegression(random_state=0), modelo__penalty=l1, modelo__solver=liblinear;, score=0.195 total time=   0.5s
[CV 4/5; 1/7] START modelo=LogisticRegression(random_state=0), modelo__penalty=l1, modelo__solver=liblinear




[CV 4/5; 1/7] END modelo=LogisticRegression(random_state=0), modelo__penalty=l1, modelo__solver=liblinear;, score=0.191 total time=   0.5s
[CV 5/5; 1/7] START modelo=LogisticRegression(random_state=0), modelo__penalty=l1, modelo__solver=liblinear




[CV 5/5; 1/7] END modelo=LogisticRegression(random_state=0), modelo__penalty=l1, modelo__solver=liblinear;, score=0.191 total time=   0.3s
[CV 1/5; 2/7] START modelo=LogisticRegression(random_state=0), modelo__penalty=l2, modelo__solver=liblinear
[CV 1/5; 2/7] END modelo=LogisticRegression(random_state=0), modelo__penalty=l2, modelo__solver=liblinear;, score=0.180 total time=   0.2s
[CV 2/5; 2/7] START modelo=LogisticRegression(random_state=0), modelo__penalty=l2, modelo__solver=liblinear
[CV 2/5; 2/7] END modelo=LogisticRegression(random_state=0), modelo__penalty=l2, modelo__solver=liblinear;, score=0.189 total time=   0.8s
[CV 3/5; 2/7] START modelo=LogisticRegression(random_state=0), modelo__penalty=l2, modelo__solver=liblinear
[CV 3/5; 2/7] END modelo=LogisticRegression(random_state=0), modelo__penalty=l2, modelo__solver=liblinear;, score=0.195 total time=   0.2s
[CV 4/5; 2/7] START modelo=LogisticRegression(random_state=0), modelo__penalty=l2, modelo__solver=liblinear
[CV 4/5; 2/7

In [139]:
results = pd.DataFrame(grid_model.cv_results_).sort_values('rank_test_score', ascending = True).head(20)

In [140]:
results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 6 to 2
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean_fit_time            7 non-null      float64
 1   std_fit_time             7 non-null      float64
 2   mean_score_time          7 non-null      float64
 3   std_score_time           7 non-null      float64
 4   param_modelo             7 non-null      object 
 5   param_modelo__penalty    2 non-null      object 
 6   param_modelo__solver     2 non-null      object 
 7   param_modelo__max_depth  5 non-null      object 
 8   params                   7 non-null      object 
 9   split0_test_score        7 non-null      float64
 10  split1_test_score        7 non-null      float64
 11  split2_test_score        7 non-null      float64
 12  split3_test_score        7 non-null      float64
 13  split4_test_score        7 non-null      float64
 14  mean_test_score          7 non

In [141]:
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_modelo,param_modelo__penalty,param_modelo__solver,param_modelo__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,5.439475,0.30478,0.109117,0.042702,"RandomForestClassifier(max_depth=10, random_st...",,,10.0,{'modelo': RandomForestClassifier(max_depth=10...,0.242022,0.234519,0.240695,0.237299,0.235902,0.238087,0.002843,1
5,5.339712,0.129377,0.108757,0.028265,"RandomForestClassifier(max_depth=10, random_st...",,,8.0,{'modelo': RandomForestClassifier(max_depth=10...,0.222464,0.221723,0.214587,0.22,0.231873,0.222129,0.005597,2
4,5.160801,0.12878,0.090948,0.020115,"RandomForestClassifier(max_depth=10, random_st...",,,6.0,{'modelo': RandomForestClassifier(max_depth=10...,0.205602,0.212718,0.211933,0.210406,0.210392,0.21021,0.002472,3
3,4.784358,0.200318,0.324991,0.41303,"RandomForestClassifier(max_depth=10, random_st...",,,4.0,{'modelo': RandomForestClassifier(max_depth=10...,0.201745,0.202858,0.206735,0.20139,0.199345,0.202414,0.002441,4
0,0.427263,0.052329,0.107051,0.068024,LogisticRegression(random_state=0),l1,liblinear,,"{'modelo': LogisticRegression(random_state=0),...",0.186114,0.193279,0.19507,0.191316,0.19104,0.191364,0.003003,5
1,0.35692,0.243531,0.059742,0.005817,LogisticRegression(random_state=0),l2,liblinear,,"{'modelo': LogisticRegression(random_state=0),...",0.180295,0.188665,0.19507,0.191316,0.192544,0.189578,0.005079,6
2,3.737473,0.256745,0.096322,0.016698,"RandomForestClassifier(max_depth=10, random_st...",,,2.0,{'modelo': RandomForestClassifier(max_depth=10...,0.175839,0.169789,0.170265,0.162481,0.16319,0.168313,0.004957,7


In [142]:
y_pred = grid_model.predict(X_test)
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

          Mixed       0.40      0.07      0.12       272
Mostly Positive       0.51      0.83      0.63       611
       Negative       0.00      0.00      0.00        31
       Positive       0.57      0.39      0.46       346
  Very Positive       0.00      0.00      0.00        31

       accuracy                           0.51      1291
      macro avg       0.29      0.26      0.24      1291
   weighted avg       0.48      0.51      0.45      1291



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
