# Model Performance Transformations

Lets practice some basic data transformation for ML performance enhancement

In [159]:
# Imports

import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score


In [122]:
# Categorical data analyser

def cat_var(df, cols):
    '''
    Return: a Pandas dataframe object with the following columns:
        - "categorical_variable" => every categorical variable include as an input parameter (string).
        - "number_of_possible_values" => the amount of unique values that can take a given categorical variable (integer).
        - "values" => a list with the posible unique values for every categorical variable (list).

    Input parameters:
        - df -> Pandas dataframe object: a dataframe with categorical variables.
        - cols -> list object: a list with the name (string) of every categorical variable to analyse.
    '''
    cat_list = []
    for col in cols:
        cat = df[col].unique()
        cat_num = len(cat)
        cat_dict = {"categorical_variable":col,
                    "number_of_possible_values":cat_num,
                    "values":cat}
        cat_list.append(cat_dict)
    df = pd.DataFrame(cat_list).sort_values(by="number_of_possible_values", ascending=False)
    return df.reset_index(drop=True)

## Scaling

Some ML algorithms have problems performing well whenever the data scale differ greatly between features. In those cases scaling the data is your best option.

- [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler)

- [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler)

Try both options and see what happens with performance (i.e.: AUC).

<img src="../images/scaling.png" alt="Drawing" style="width: 500px;"/>

In [123]:
# Weather dataset (https://www.kaggle.com/jsphyg/weather-dataset-rattle-package)

weather = pd.read_csv('../data/weatherAUS.csv')
print(weather.shape)
weather.head()

(145460, 23)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [124]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [125]:
# Uluru weather (numerical features)

weather = weather[weather['Location'].isin(['Uluru'])].reset_index(drop=True)
weather = weather[weather['RainToday'].isin(['No','Yes'])].reset_index(drop=True)
weather = weather[weather['RainTomorrow'].isin(['No','Yes'])]
weather = weather[['MinTemp',
                   'MaxTemp',
                   'Rainfall',
                   'WindSpeed9am',
                   'WindSpeed3pm',
                   'Humidity9am',
                   'Humidity3pm',
                   'Pressure9am',
                   'Pressure3pm',
                   'Temp9am',
                   'Temp3pm',
                   'RainTomorrow']]
weather = weather.dropna().reset_index(drop=True)
col_weather = list(weather.columns)
print(col_weather)
print(weather.shape)
print(weather.describe())
weather.head()

['MinTemp', 'MaxTemp', 'Rainfall', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainTomorrow']
(1479, 12)
           MinTemp      MaxTemp     Rainfall  WindSpeed9am  WindSpeed3pm  \
count  1479.000000  1479.000000  1479.000000   1479.000000   1479.000000   
mean     14.368627    30.402299     0.716700     17.613928     17.050710   
std       7.432857     7.624058     4.208585      7.887082      6.893016   
min      -1.900000    11.300000     0.000000      0.000000      0.000000   
25%       8.100000    23.800000     0.000000     11.000000     11.000000   
50%      14.900000    31.200000     0.000000     17.000000     17.000000   
75%      20.800000    37.100000     0.000000     24.000000     22.000000   
max      31.000000    44.400000    83.800000     41.000000     48.000000   

       Humidity9am  Humidity3pm  Pressure9am  Pressure3pm      Temp9am  \
count  1479.000000  1479.000000  1479.000000  1479.000000  1479.0

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainTomorrow
0,19.7,30.0,0.8,30.0,24.0,76.0,54.0,1010.6,1007.5,21.7,28.4,No
1,21.6,33.1,0.0,22.0,11.0,44.0,33.0,1010.5,1006.5,24.6,31.3,No
2,21.3,36.1,0.0,24.0,13.0,39.0,27.0,1006.9,1002.7,27.6,34.5,No
3,22.9,37.7,0.0,28.0,13.0,35.0,22.0,1006.0,1002.1,28.7,35.4,No
4,24.0,39.0,0.0,20.0,19.0,33.0,21.0,1006.9,1003.5,29.9,37.3,No


In [126]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1479 entries, 0 to 1478
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MinTemp       1479 non-null   float64
 1   MaxTemp       1479 non-null   float64
 2   Rainfall      1479 non-null   float64
 3   WindSpeed9am  1479 non-null   float64
 4   WindSpeed3pm  1479 non-null   float64
 5   Humidity9am   1479 non-null   float64
 6   Humidity3pm   1479 non-null   float64
 7   Pressure9am   1479 non-null   float64
 8   Pressure3pm   1479 non-null   float64
 9   Temp9am       1479 non-null   float64
 10  Temp3pm       1479 non-null   float64
 11  RainTomorrow  1479 non-null   object 
dtypes: float64(11), object(1)
memory usage: 138.8+ KB


In [127]:
# Features + target

X = weather[['MinTemp',
          'MaxTemp',
          'Rainfall',
          'WindSpeed9am',
          'WindSpeed3pm',
          'Humidity9am',
          'Humidity3pm',
          'Pressure9am',
          'Pressure3pm',
          'Temp9am',
          'Temp3pm']]
y = pd.get_dummies(weather['RainTomorrow'], drop_first=True)['Yes']
print(X.shape,y.shape)

(1479, 11) (1479,)


In [128]:
# Train + test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (1183, 11), X_test: (296, 11), y_train: (1183,), y_test: (296,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


In [137]:
# Scaling

#scaler = StandardScaler()
#scaler = MinMaxScaler()
scaler = RobustScaler()

In [138]:
X_train = scaler.fit_transform(X_train)
X_test  = scaler.fit_transform(X_test)


In [139]:
# Linear model

linear_model = LogisticRegression(max_iter=1000)
linear_param = linear_model.fit(X_train, y_train)
linear_pred = linear_model.predict(X_test)
linear_auc = roc_auc_score(y_test, linear_pred)
print(f"Linear model AUC is: {linear_auc}")


Linear model AUC is: 0.6542846285388563


In [140]:
# Ensemble model

ensemble_model = RandomForestRegressor()
ensemble_param = ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
print(f"Linear model AUC is: {ensemble_auc}")


Linear model AUC is: 0.801349040471214


## **El mejor modelo RandomForest con StandardScaler

---

## Enconding

ML algorithms do not support categorical data. Therefore you need to find a way to transform categorical data into numerical. You must compare the results using both techniques: __One Hot Encoding__ or __Label Encoding__

- [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder)

- [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder)

<img src="../images/encoding.png" alt="Drawing" style="width: 500px;"/>

In [141]:
# Mushrooms dataset (https://www.kaggle.com/uciml/mushroom-classification)

mushrooms = pd.read_csv('../data/mushrooms.csv')
col_mushrooms = list(mushrooms.columns)
print(mushrooms.shape)
mushrooms.head()

(8124, 23)


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [142]:
mushrooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [143]:
# Features analysis

cat_mushrooms = cat_var(mushrooms, col_mushrooms)
cat_mushrooms

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,gill-color,12,"[k, n, g, p, w, h, u, e, b, r, y, o]"
1,cap-color,10,"[n, y, w, g, e, p, b, u, c, r]"
2,spore-print-color,9,"[k, n, u, h, w, r, o, y, b]"
3,odor,9,"[p, a, l, n, f, c, y, s, m]"
4,stalk-color-below-ring,9,"[w, p, g, b, n, e, y, o, c]"
5,stalk-color-above-ring,9,"[w, g, p, n, b, e, o, c, y]"
6,habitat,7,"[u, g, m, d, p, w, l]"
7,cap-shape,6,"[x, b, s, f, k, c]"
8,population,6,"[s, n, a, v, y, c]"
9,ring-type,5,"[p, e, l, f, n]"


In [145]:
mushrooms_le=mushrooms
for col in mushrooms_le.drop('class', axis=1).columns:
    mushrooms_le[col] = LabelEncoder().fit_transform(mushrooms_le[col])
mushrooms_le


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,e,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,e,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,p,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,e,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,e,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,e,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,p,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


In [146]:
# Features + target (encoding). IMPORTANT: you may pick any of the 2-labeled features as you target (choose wisely!!!)

mushrooms_le = mushrooms
for col in mushrooms_le.drop('class', axis=1).columns:
    mushrooms_le[col] = LabelEncoder().fit_transform(mushrooms_le[col])
mushrooms_le



Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,e,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,e,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,p,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,e,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,e,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,e,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,p,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


In [147]:
mushrooms_encoded = pd.concat([mushrooms.drop(list(cat_mushrooms['categorical_variable']), axis = 1),pd.get_dummies(mushrooms[list(cat_mushrooms['categorical_variable'])],drop_first=True)],axis =1)
mushrooms_encoded

Unnamed: 0,gill-color,cap-color,spore-print-color,odor,stalk-color-below-ring,stalk-color-above-ring,habitat,cap-shape,population,ring-type,...,veil-color,cap-surface,ring-number,stalk-shape,gill-size,gill-spacing,gill-attachment,bruises,veil-type,class_p
0,4,4,2,6,7,7,5,5,3,4,...,2,2,1,0,1,0,1,1,0,True
1,4,9,3,0,7,7,1,5,2,4,...,2,2,1,0,0,0,1,1,0,False
2,5,8,3,3,7,7,3,0,2,4,...,2,2,1,0,0,0,1,1,0,False
3,5,8,2,6,7,7,5,5,3,4,...,2,3,1,0,1,0,1,1,0,True
4,4,3,3,5,7,7,1,5,0,0,...,2,2,1,1,0,1,1,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,11,4,0,5,5,5,2,3,1,4,...,1,2,1,0,0,0,0,0,0,False
8120,11,4,0,5,5,5,2,5,4,4,...,0,2,1,0,0,0,0,0,0,False
8121,5,4,0,5,5,5,2,2,1,4,...,1,2,1,0,0,0,0,0,0,False
8122,0,4,7,8,7,7,2,3,4,0,...,2,3,1,1,1,0,1,0,0,True


In [148]:
X = mushrooms_encoded.drop('class_p', axis = 1)
y = mushrooms_encoded['class_p']
print(X.shape,y.shape)

(8124, 22) (8124,)


In [149]:
# Train + test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")


X_train: (6499, 22), X_test: (1625, 22), y_train: (6499,), y_test: (1625,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


In [150]:
# Scaling

#scaler = StandardScaler()
#scaler = MinMaxScaler()
#scaler = RobustScaler()

#NOT NEEDED Features categg


In [151]:
X_train = scaler.fit_transform(X_train)
X_test  = scaler.fit_transform(X_test)

In [181]:
# Linear model
#no vale para modelos Classification
#linear_model = LogisticRegression(max_iter=1000)
#linear_param = linear_model.fit(X_train, y_train)
#linear_pred = linear_model.predict(X_test)
#linear_auc = roc_auc_score(y_test, linear_pred)
#print(f"Linear model AUC is: {linear_auc}")

Linear model AUC is: 0.9096781680334209


In [165]:
model = SGDClassifier(loss = 'log_loss') 
model.fit(X_train, y_train)
predictions = model.predict(X_test)
check = pd.DataFrame({'Ground truth':y_test, 'Predictions':predictions, 'Diff':np.logical_xor(y_test, predictions)})

accuracy=model.score(X_test, y_test)
accuracy

0.8916923076923077

In [154]:
# Ensemble model

ensemble_model = RandomForestClassifier()
ensemble_param = ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
print(f"Linear model AUC is: {ensemble_auc}")

Linear model AUC is: 1.0


## El Random forest siempre da 1 (la mejor opción independientemente del scaling), para el lineal es mejor el StandarScaler

---

## Bonus

Now that you can grasp the potential of pre-processing your data...what would you do about the following dataset?

<img src="../images/bonus.jpg" alt="Drawing" style="width: 500px;"/>

In [166]:
netflix.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [172]:
# Netflix dataset (https://www.kaggle.com/shivamb/netflix-shows)

netflix = pd.read_csv('../data/netflix_titles.csv',sep=",")
col_netflix = list(netflix.columns)
print(netflix.shape)
netflix.head()

(7787, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [162]:
# ML workflow -> ¿what would you do?

netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   object
 10  listed_in     7787 non-null   object
 11  description   7787 non-null   object
dtypes: int64(1), object(11)
memory usage: 730.2+ KB


In [None]:
netflix.info()

In [163]:
cat_netflix = cat_var(netflix, col_netflix)
cat_netflix 

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,show_id,7787,"[s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,..."
1,title,7787,"[3%, 7:19, 23:59, 9, 21, 46, 122, 187, 706, 19..."
2,description,7769,[In a future where the elite inhabit an island...
3,cast,6832,"[João Miguel, Bianca Comparato, Michel Gomes, ..."
4,director,4050,"[nan, Jorge Michel Grau, Gilbert Chan, Shane A..."
5,date_added,1566,"[August 14, 2020, December 23, 2016, December ..."
6,country,682,"[Brazil, Mexico, Singapore, United States, Tur..."
7,listed_in,492,"[International TV Shows, TV Dramas, TV Sci-Fi ..."
8,duration,216,"[4 Seasons, 93 min, 78 min, 80 min, 123 min, 1..."
9,release_year,73,"[2020, 2016, 2011, 2009, 2008, 2019, 1997, 201..."


In [180]:
netflix.rating.unique().tolist()

['TV-MA',
 'R',
 'PG-13',
 'TV-14',
 'TV-PG',
 'NR',
 'TV-G',
 'TV-Y',
 nan,
 'TV-Y7',
 'PG',
 'G',
 'NC-17',
 'TV-Y7-FV',
 'UR']

In [177]:
netflix = netflix.drop('cast', axis=1)

In [183]:
'''
Nuestro target sería el rating. 
Primero tenemos que tratar (eliminar o sustituir) los NaN y nulos. De todas las columnas/filas que los tienen. Teniendo en vuenta que algunas columnas como cast no aportan mucha informacion porque casi todos los valores que tienen son unicos. 
Vemos que hay datos poco coherentes, como en titulos. Hay que ver como tratarlos. 
En listed_in tendriamos que expandir primero las listas de cada celda. 
Hacer el encoding y determinar la X y Y. Probar barios modelos para merjorar las metricas. 

'''




'\nNuestro target sería el rating. \nPrimero tenemos que tratar (eliminar o sustituir) los NaN y nulos. De todas las columnas/filas que los tienen. Teniendo en vuenta que algunas columnas como cast no aportan mucha informacion porque casi todos los valores que tienen son unicos. \nVemos que hay datos poco coherentes, como en titulos. Hay que ver como tratarlos. \nEn listed_in tendriamos que expandir primero las listas de cada celda. \nHacer el encoding y determinar la X y Y. Probar barios modelos para merjorar las metricas. \n\n'

---