# Import necessary dependencies and settings

In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import scipy.stats as spstats

%matplotlib inline
mpl.style.reload_library()
mpl.style.use('classic')
mpl.rcParams['figure.facecolor'] = (1, 1, 1, 0)
mpl.rcParams['figure.figsize'] = [6.0, 4.0]
mpl.rcParams['figure.dpi'] = 100

# Raw Measures

## Values

In [31]:
# Lee Pokemon.csv en un DataFrame
pokemon = pd.read_csv('./data/Pokemon.csv', encoding='latin1')
pokemon.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,2,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,3,False
3,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
4,5,Charmeleon,Fire,,405,58,64,58,80,65,80,2,False


In [32]:
# Muestra las columnas HP, Attack y Defense
pokemon[['HP', 'Attack', 'Defense']]

Unnamed: 0,HP,Attack,Defense
0,45,49,49
1,60,62,63
2,80,82,83
3,39,52,43
4,58,64,58
...,...,...,...
146,41,64,45
147,61,84,65
148,91,134,95
149,106,110,90


In [33]:
# Muestra una descripción de esas columnas
pokemon[['HP', 'Attack', 'Defense']].describe()

Unnamed: 0,HP,Attack,Defense
count,151.0,151.0,151.0
mean,64.211921,72.549669,68.225166
std,28.590117,26.596162,26.916704
min,10.0,5.0,5.0
25%,45.0,51.0,50.0
50%,60.0,70.0,65.0
75%,80.0,90.0,84.0
max,250.0,134.0,180.0


## Counts

Load the song_views.csv dataset and understand the features.

In [34]:
# Lee song_views.csv y visualízalo en un DataFrame
songs = pd.read_csv('./data/song_views.csv')
songs.head()

Unnamed: 0,user_id,song_id,title,listen_count
0,b6b799f34a204bd928ea014c243ddad6d0be4f8f,SOBONKR12A58A7A7E0,You're The One,2
1,b41ead730ac14f6b6717b9cf8859d5579f3f8d4d,SOBONKR12A58A7A7E0,You're The One,0
2,4c84359a164b161496d05282707cecbd50adbfc4,SOBONKR12A58A7A7E0,You're The One,0
3,779b5908593756abb6ff7586177c966022668b06,SOBONKR12A58A7A7E0,You're The One,0
4,dd88ea94f605a63d9fc37a214127e3f00e85e42d,SOBONKR12A58A7A7E0,You're The One,0


# Binarization

Often raw frequencies or counts may not be relevant for building a model based on the problem which is being solved. For instance if I’m building a recommendation system for song recommendations, I would just want to know if a person is interested or has listened to a particular song. This doesn’t require the number of times a song has been listened to since I am more concerned about the various songs he\she has listened to. In this case, a binary feature is preferred as opposed to a count based feature. Add a column that includes this information, with a new column watched, that takes the value 1, when the listen count is >0


In [35]:
def watched(x):
    if x > 0:
        return True
    else:
        return False

In [36]:
# en el DataFrame de canciones, añade una columna que indique con el valor 1 si esa canción se ha escuchado alguna vez
songs['watched1'] = songs['listen_count'].apply(bool).apply(int)

In [37]:
# Muestra un head para ver tus resultados
songs.head()

Unnamed: 0,user_id,song_id,title,listen_count,watched1
0,b6b799f34a204bd928ea014c243ddad6d0be4f8f,SOBONKR12A58A7A7E0,You're The One,2,1
1,b41ead730ac14f6b6717b9cf8859d5579f3f8d4d,SOBONKR12A58A7A7E0,You're The One,0,0
2,4c84359a164b161496d05282707cecbd50adbfc4,SOBONKR12A58A7A7E0,You're The One,0,0
3,779b5908593756abb6ff7586177c966022668b06,SOBONKR12A58A7A7E0,You're The One,0,0
4,dd88ea94f605a63d9fc37a214127e3f00e85e42d,SOBONKR12A58A7A7E0,You're The One,0,0


## Binarization with sklearn

Look at the documentation of sklearn preprecessing. Specifically to the Binarizer method. Try to use this method to obtainn a binarization of the song_views dataset.

In [38]:
# Busca documentación sobre el preprocesado de sklearn (en concreto, Binarizer)
from sklearn.preprocessing import Binarizer

# Binarizer tiene la frontera en 0 por defecto, pero lo ponemos para practicar
my_binarizer = Binarizer(threshold=0)
songs['bin_res'] = my_binarizer.transform(songs[['watched4']])
songs.head()

KeyError: "None of [Index(['watched4'], dtype='object')] are in the [columns]"

# Rounding

Load the item_popularity.csv dataset and understand the features.

In [39]:
item_popularity = pd.read_csv('./data/item_popularity.csv')
item_popularity.head()

Unnamed: 0,item_id,pop_percent
0,it_01345,0.98324
1,it_03431,0.56123
2,it_04572,0.12098
3,it_98021,0.35476
4,it_01298,0.92101


Include new columns in the dataset showing a popularity scale of 100 and 1000, being those 2 columns integer numbers.

In [40]:
item_popularity['pop_100'] = item_popularity['pop_percent']*100
item_popularity['pop_1000'] = item_popularity['pop_percent']*1000
item_popularity.head()

Unnamed: 0,item_id,pop_percent,pop_100,pop_1000
0,it_01345,0.98324,98.324,983.24
1,it_03431,0.56123,56.123,561.23
2,it_04572,0.12098,12.098,120.98
3,it_98021,0.35476,35.476,354.76
4,it_01298,0.92101,92.101,921.01


# Interactions

Load the pokemon dataset. Build a new data set including only 'Attack' and 'Defense'.

In [41]:
pokemon_df = pd.read_csv('./data/Pokemon.csv', encoding='latin-1')
pokemon_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,2,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,3,False
3,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
4,5,Charmeleon,Fire,,405,58,64,58,80,65,80,2,False


In [42]:
pokemon_df_ad = pokemon_df[['Attack', 'Defense']].copy()

In [43]:
pokemon_df_ad.head()

Unnamed: 0,Attack,Defense
0,49,49
1,62,63
2,82,83
3,52,43
4,64,58


Build a new dataframe using the PolynomialFeatures method in sklearn.preprocesing. Use a degree 2 polynomic function. Try to understand what is happening.

In [44]:
from sklearn.preprocessing import PolynomialFeatures
pol_df = pd.DataFrame({'C1': [4,7,6], 'C2':[10,3,8]})
pol_df

Unnamed: 0,C1,C2
0,4,10
1,7,3
2,6,8


In [46]:
# Cada columna eleva cada fila a 0.1.2 y el producto de ambas
poly = PolynomialFeatures(2, interaction_only=True)
poly.fit_transform(pol_df)

array([[ 1.,  4., 10., 40.],
       [ 1.,  7.,  3., 21.],
       [ 1.,  6.,  8., 48.]])

In [47]:
poly.get_feature_names()

['1', 'x0', 'x1', 'x0 x1']

In [18]:
# Lo que estamos calculando es el Ataque x Defensa, es decir, una medida de fortaleza del pokemon
pokemon_df['Atck*Def'] = pokemon_df['Attack'] * pokemon_df['Defense']
pokemon_df.head(100)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,Legendary,Atck*Def
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False,2401
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,2,False,3906
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,3,False,6806
3,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False,2236
4,5,Charmeleon,Fire,,405,58,64,58,80,65,80,2,False,3712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,Drowzee,Psychic,,328,60,48,45,43,90,42,1,False,2160
96,97,Hypno,Psychic,,483,85,73,70,73,115,67,2,False,5110
97,98,Krabby,Water,,325,30,105,90,25,25,50,1,False,9450
98,99,Kingler,Water,,475,55,130,115,50,50,75,2,False,14950
