In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
from sklearn.pipeline import Pipeline

In [3]:
#pip install pickle5
#pip install pyarrow

In [4]:
with open('test.pickle', "rb") as fh:
    data_val = pickle.load(fh)

In [5]:
data1 = pd.read_parquet('train_numerical_features.parquet')
data2 = pd.read_parquet('train_text_features.parquet')

data = data1.merge(data2, on = ['id', 'tagline', 'credits', 'title'])

In [6]:
data.drop(['poster_path', 'backdrop_path', 'recommendations'], axis=1, inplace=True)

data = data[~ (
    (data['revenue']==0) |
    (data['release_date'].isna()) |
    (data['runtime'].isna()) |
    ((data['status'] != 'Released'))
                )]

data['release_date'] = pd.to_datetime(data['release_date'], format = '%Y-%m-%d')

data_cat = data.select_dtypes(include=[object])
data[data_cat.columns] = data_cat.fillna('')

data.loc[:, 'label'] = 'Very Positive'

data.loc[(data['vote_average'] <= 8) & (data['vote_average'] > 7), 'label'] = 'Positive'
data.loc[(data['vote_average'] <= 7) & (data['vote_average'] > 6), 'label'] = 'Mostly Positive'
data.loc[(data['vote_average'] <= 6) & (data['vote_average'] > 5), 'label'] = 'Mixed'
data.loc[(data['vote_average'] <= 5) , 'label'] = 'Negative'

data.drop(['vote_average', 'id', 'status'], axis=1, inplace=True)
data.rename(columns = {'revenue':'target'},
            inplace=True)

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import confusion_matrix

In [8]:
data.head(1)

Unnamed: 0,title,budget,target,runtime,tagline,credits,genres,original_language,overview,production_companies,release_date,keywords,label
0,Fantastic Beasts: The Secrets of Dumbledore,200000000.0,400000000.0,142.0,Return to the magic.,Jude Law-Eddie Redmayne-Mads Mikkelsen-Ezra Mi...,Fantasy-Adventure-Action,en,Professor Albus Dumbledore knows the powerful ...,Warner Bros. Pictures-Heyday Films,2022-04-06,magic-curse-fantasy world-wizard-magical creat...,Mostly Positive


In [9]:
col_num = ['budget', 'runtime']

In [10]:
#pip install -U scikit-learn==1.1

In [11]:
values = [x.split(sep='-') for x in data_cat['genres'].unique()]
values = [i  for x in values for i in x]

val_unique, counts = np.unique(values, return_counts=True)
series_count = pd.Series(counts, index = val_unique).sort_values(ascending = False).head(10)
    
genres_cols = series_count.index
global genres_cols

In [12]:
def most_freq(dataframe):
    df = {}
    for col in dataframe.columns:
        if col != 'original_language':
            values = [x.split(sep='-') for x in dataframe[col].unique()]
            values = [i  for x in values for i in x]
        else:
            values = dataframe[col]
        val_unique, counts = np.unique(values, return_counts=True)
        series_count = pd.Series(counts, index = val_unique).sort_values(ascending = False).head(10)
    
        
        for value in series_count.index:
            df[f'{col}_{value}'] = pd.DataFrame(np.where(dataframe[col].str.contains(col), 1, 0))
    
    df_final = pd.concat(df.values(), axis=1)
    df_final.columns = df.keys()
    return df_final

transformer = FunctionTransformer(most_freq)

def date_split(df_original):
    df = df_original.copy()
    for col in df.columns:
        df[f'{col}_day'] = df[col].dt.day
        df[f'{col}_month'] = df[col].dt.month
        df[f'{col}_year'] = df[col].dt.year
    return df.drop(col, axis=1)
    
transformer2 = FunctionTransformer(date_split)  

In [13]:
len(data.original_language.unique())

34

In [14]:
preprocessing = ColumnTransformer(
    transformers = [
        ('StandardScaler', StandardScaler(), col_num),
        ('GenresEncoder',transformer, ['genres', 'keywords', 'production_companies', 'original_language']),
        ('DateConverter', transformer2, ['release_date'])
    ])

In [15]:
pipe1 = Pipeline(steps = [
    ('prepro', preprocessing),
    ('model', DecisionTreeClassifier())
])

Clasificación

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    data[['original_language', 'budget', 'runtime', 'genres',
          'keywords', 'production_companies', 'release_date']],
    data['label'],
    shuffle = True,
    test_size = 0.2,
    random_state = 0
)

In [22]:
pipelines = {'DecisionTree': pipe1}

for name, model in pipelines.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    scores = classification_report(y_test, y_pred)
    print(f'Para el modelo {name} el performance es el siguiente:\n',scores,'\n\n')

Para el modelo DecisionTree el performance es el siguiente:
                  precision    recall  f1-score   support

          Mixed       0.28      0.30      0.29       272
Mostly Positive       0.50      0.47      0.48       611
       Negative       0.06      0.06      0.06        31
       Positive       0.39      0.39      0.39       346
  Very Positive       0.07      0.10      0.08        31

       accuracy                           0.40      1291
      macro avg       0.26      0.27      0.26      1291
   weighted avg       0.40      0.40      0.40      1291
 




In [27]:
y_pred

array(['Positive', 'Negative', 'Positive', ..., 'Positive', 'Positive',
       'Mostly Positive'], dtype=object)

In [36]:
cats = ['Very Positive','Positive','Mostly Positive','Mixed','Negative']

pd.DataFrame(confusion_matrix(y_test,
                              y_pred,
                              labels = cats),
            columns = cats,
            index = cats)

Unnamed: 0,Very Positive,Positive,Mostly Positive,Mixed,Negative
Very Positive,3,11,15,2,0
Positive,23,136,135,46,6
Mostly Positive,14,146,289,148,14
Mixed,1,48,131,82,10
Negative,0,6,13,10,2
