In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
from sklearn.pipeline import Pipeline

In [3]:
#pip install pickle5
#pip install pyarrow

In [4]:
with open('test.pickle', "rb") as fh:
    data_val = pickle.load(fh)

In [5]:
data1 = pd.read_parquet('train_numerical_features.parquet')
data2 = pd.read_parquet('train_text_features.parquet')

data = data1.merge(data2, on = ['id', 'tagline', 'credits', 'title'])

In [6]:
data.drop(['poster_path', 'backdrop_path', 'recommendations'], axis=1, inplace=True)

data = data[~ (
    (data['revenue']==0) |
    (data['release_date'].isna()) |
    (data['runtime'].isna()) |
    ((data['status'] != 'Released'))
                )]

data['release_date'] = pd.to_datetime(data['release_date'], format = '%Y-%m-%d')

data_cat = data.select_dtypes(include=[object])
data[data_cat.columns] = data_cat.fillna('')

data.loc[:, 'label'] = 'Very Positive'

data.loc[(data['vote_average'] <= 8) & (data['vote_average'] > 7), 'label'] = 'Positive'
data.loc[(data['vote_average'] <= 7) & (data['vote_average'] > 6), 'label'] = 'Mostly Positive'
data.loc[(data['vote_average'] <= 6) & (data['vote_average'] > 5), 'label'] = 'Mixed'
data.loc[(data['vote_average'] <= 5) , 'label'] = 'Negative'

data.drop(['vote_average', 'id', 'status'], axis=1, inplace=True)
data.rename(columns = {'revenue':'target'},
            inplace=True)

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

In [8]:
data.head(1)

Unnamed: 0,title,budget,target,runtime,tagline,credits,genres,original_language,overview,production_companies,release_date,keywords,label
0,Fantastic Beasts: The Secrets of Dumbledore,200000000.0,400000000.0,142.0,Return to the magic.,Jude Law-Eddie Redmayne-Mads Mikkelsen-Ezra Mi...,Fantasy-Adventure-Action,en,Professor Albus Dumbledore knows the powerful ...,Warner Bros. Pictures-Heyday Films,2022-04-06,magic-curse-fantasy world-wizard-magical creat...,Mostly Positive


In [9]:
col_num = data.select_dtypes(include=[np.number]).columns

In [10]:
#pip install -U scikit-learn==1.1

In [121]:
values = [x.split(sep='-') for x in data_cat['genres'].unique()]
values = [i  for x in values for i in x]

val_unique, counts = np.unique(values, return_counts=True)
series_count = pd.Series(counts, index = val_unique).sort_values(ascending = False).head(10)
    
genres_cols = series_count.index
global genres_cols

In [122]:
def generos(dataframe):
    df = {}
    for genero in genres_cols:
        df[f'genre_{genero}'] = pd.DataFrame(np.where(dataframe['genres'].str.contains(genero), 1, 0))
    df_final = pd.concat(df.values(), axis=1)
    df_final.columns = df.keys()
    return df_final

transformer = FunctionTransformer(generos)

In [123]:
preprocessing = ColumnTransformer(
    transformers = [
        ('StandardScaler', StandardScaler(), col_num),
        ('OneHotEncoder', OneHotEncoder(max_categories = 1, handle_unknown = 'ignore'),
         ['original_language']),
        ('GenresEncoder',transformer, ['genres'])
    ])

In [124]:
pipe1 = Pipeline(steps = [
    ('prepro', preprocessing),
    ('model', DecisionTreeClassifier())
])

Clasificación

In [129]:
X_train, X_test, y_train, y_test = train_test_split(
    data[['original_language', 'budget', 'target', 'runtime', 'genres']],
    data['label'],
    shuffle = True,
    test_size = 0.2,
    random_state = 0
)

In [130]:
pipelines = {'DecisionTree': pipe1}

In [131]:
for name, model in pipelines.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    scores = classification_report(y_test, y_pred)
    print(f'Para el modelo {name} el performance es el siguiente:\n',scores,'\n\n')

Para el modelo DecisionTree el performance es el siguiente:
                  precision    recall  f1-score   support

          Mixed       0.33      0.32      0.33       272
Mostly Positive       0.52      0.52      0.52       611
       Negative       0.03      0.03      0.03        31
       Positive       0.44      0.45      0.44       346
  Very Positive       0.09      0.10      0.10        31

       accuracy                           0.43      1291
      macro avg       0.28      0.28      0.28      1291
   weighted avg       0.44      0.43      0.43      1291
 




In [132]:
model.feature_names_in_

array(['original_language', 'budget', 'target', 'runtime', 'genres'],
      dtype=object)