# Aprendizaje no supervizado

In [60]:
import pandas as pd
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display
from sklearn.utils import shuffle
from efficient_apriori import apriori
pd.options.display.max_colwidth = 100
np.random.seed(22)

In [2]:
#Uploading data
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
def size(obj):
    return "{0:.2f} MB".format(sys.getsizeof(obj) / (1000 * 1000))

### EDA

This dataset (ml-20m) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 20000263 ratings and 465564 tag applications across 27278 movies. These data were created by 138493 users between January 09, 1995 and March 31, 2015. This dataset was generated on March 31, 2015, and updated on October 17, 2016 to update links.csv and add genome-* files.

In [4]:
print(movies.shape)
movies.head()

(27278, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
print(ratings.shape)
ratings.head()

(20000263, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [6]:
print(len(movies[movies.genres == '(no genres listed)'].index))

246


In [7]:
#Drop movies with non genres listed.
print(len(movies.index))
movies.drop(movies[movies.genres == '(no genres listed)'].index, inplace=True)
print(len(movies.index))

27278
27032


In [34]:
#Cortamos el dataset pued es demasiado grande, solo trabajamos con el 0,05 %! que contiene 100000 filas con 702 usuarios.
cuted = ratings.head(int(len(ratings)*0.005))
print('cuted -- dimensions: {0};   size: {1}'.format(cuted.shape, size(movies)))
display(cuted.head(5))
display(cuted.tail(5))

cuted -- dimensions: (100001, 4);   size: 4.68 MB


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


Unnamed: 0,userId,movieId,rating,timestamp
99996,702,1374,2.5,1095136686
99997,702,1377,1.5,1095141779
99998,702,1378,1.5,1095139088
99999,702,1380,3.5,1095139491
100000,702,1387,3.0,1095136163


In [32]:
#Deecodificamos el nombre de la pelicula 
merged_df = pd.merge(cuted[['userId','movieId']], movies[['movieId','title']] ,on='movieId', how= "inner")
merged_df = merged_df.sort_values( by='userId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
display(merged_df.head(5))

merged = merged_df.values[:,[0,2]] # 0 es userId, 1 es movieId
print(len(merged))
print(merged[:3])

Unnamed: 0,userId,movieId,title
0,1,2,Jumanji (1995)
11345,1,5797,"Company of Wolves, The (1984)"
11348,1,5816,Harry Potter and the Chamber of Secrets (2002)
11420,1,5898,"Sword and the Sorcerer, The (1982)"
11422,1,5952,"Lord of the Rings: The Two Towers, The (2002)"


100001
[[1 'Jumanji (1995)']
 [1 'Company of Wolves, The (1984)']
 [1 'Harry Potter and the Chamber of Secrets (2002)']]


### Aplicamos reglas de asociación

In [33]:
transactions=[]
for movie_id, movie_object in groupby(merged, lambda x: x[0]):
    transactions.append([item[1] for item in movie_object])
print(transactions)

[['Jumanji (1995)', 'Company of Wolves, The (1984)', 'Harry Potter and the Chamber of Secrets (2002)', 'Sword and the Sorcerer, The (1982)', 'Lord of the Rings: The Two Towers, The (2002)', 'Heavy Metal 2000 (2000)', 'Last Unicorn, The (1982)', 'Ringu (Ring) (1998)', 'X2: X-Men United (2003)', '28 Days Later (2002)', 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)', 'Underworld (2003)', 'Bubba Ho-tep (2002)', 'Videodrome (1983)', "Monty Python's The Meaning of Life (1983)", 'Wilder Napalm (1993)', 'Scary Movie 3 (2003)', 'Invasion of the Body Snatchers (1978)', 'Ring, The (2002)', 'Clash of the Titans (1981)', 'Time Machine, The (2002)', 'Vampire Hunter D: Bloodlust (Banpaia hantâ D) (2000)', 'Memento (2000)', 'Shrek (2001)', "One Flew Over the Cuckoo's Nest (1975)", 'Final Fantasy: The Spirits Within (2001)', 'Adventures of Baron Munchausen, The (1988)', "Bill & Ted's Excellent Adventure (1989)", 'Others, The (2001)', 'Wicker Man, The (1973)', 'Witches, The (1990)', 'Do

In [35]:
itemsets, rules = apriori(transactions, min_support=0.2,  min_confidence=0.3, max_length=3, verbosity=2)
len(rules)

Generating itemsets.
 Counting itemsets of length 1.
  Found 8225 candidate itemsets of length 1.
  Found 69 large itemsets of length 1.
    [('Terminator 2: Judgment Day (1991)',), ('Pulp Fiction (1994)',), ('Alien (1979)',), ('Blade Runner (1982)',), ('Sixth Sense, The (1999)',), ('Monty Python and the Holy Grail (1975)',), ('Usual Suspects, The (1995)',), ('Memento (2000)',), ("One Flew Over the Cuckoo's Nest (1975)",), ('Star Wars: Episode I - The Phantom Menace (1999)',), ('Indiana Jones and the Last Crusade (1989)',), ('Fight Club (1999)',), ('Terminator, The (1984)',), ('Lord of the Rings: The Fellowship of the Ring, The (2001)',), ('Seven (a.k.a. Se7en) (1995)',), ('E.T. the Extra-Terrestrial (1982)',), ('Star Wars: Episode V - The Empire Strikes Back (1980)',), ('Shawshank Redemption, The (1994)',), ('Lord of the Rings: The Return of the King, The (2003)',), ('Lord of the Rings: The Two Towers, The (2002)',), ('Star Wars: Episode IV - A New Hope (1977)',), ('Shrek (2001)',), (

  Found 140 large itemsets of length 2.
   [('Ace Ventura: Pet Detective (1994)', 'Apollo 13 (1995)'), ('Ace Ventura: Pet Detective (1994)', 'Forrest Gump (1994)'), ('Ace Ventura: Pet Detective (1994)', 'Jurassic Park (1993)'), ('Ace Ventura: Pet Detective (1994)', 'Pulp Fiction (1994)'), ('Aladdin (1992)', 'Forrest Gump (1994)'), ('Aladdin (1992)', 'Jurassic Park (1993)'), ('American Beauty (1999)', 'Fight Club (1999)'), ('American Beauty (1999)', 'Forrest Gump (1994)'), ('American Beauty (1999)', 'Matrix, The (1999)'), ('American Beauty (1999)', 'Pulp Fiction (1994)'), ('American Beauty (1999)', 'Silence of the Lambs, The (1991)'), ('Apollo 13 (1995)', 'Batman (1989)'), ('Apollo 13 (1995)', 'Braveheart (1995)'), ('Apollo 13 (1995)', 'Dances with Wolves (1990)'), ('Apollo 13 (1995)', 'Forrest Gump (1994)'), ('Apollo 13 (1995)', 'Fugitive, The (1993)'), ('Apollo 13 (1995)', 'Jurassic Park (1993)'), ('Apollo 13 (1995)', 'Pulp Fiction (1994)'), ('Apollo 13 (1995)', 'Shawshank Redemption,

442

In [36]:
rules=sorted(rules, key=lambda rule: rule.confidence)
print(rules[:3])

[{Pulp Fiction (1994)} -> {Silence of the Lambs, The (1991), Usual Suspects, The (1995)}, {Pulp Fiction (1994)} -> {Forrest Gump (1994), Fugitive, The (1993)}, {Pulp Fiction (1994)} -> {Forrest Gump (1994), Terminator 2: Judgment Day (1991)}]


In [37]:
rules_dict = [{'rule_lhs':x.lhs, 'rule_rhs':x.rhs, 'confidence':x.confidence, 'support':x.support, 'lift':x.lift, 'conviction':x.conviction} for x in rules]
print('Cantidad de relgas: ', len(rules_dict))
print(rules_dict[:2])

Cantidad de relgas:  442
[{'rule_lhs': ('Pulp Fiction (1994)',), 'rule_rhs': ('Silence of the Lambs, The (1991)', 'Usual Suspects, The (1995)'), 'confidence': 0.4057142857142857, 'support': 0.2022792022792023, 'lift': 1.8140855323020928, 'conviction': 1.3063636839731703}, {'rule_lhs': ('Pulp Fiction (1994)',), 'rule_rhs': ('Forrest Gump (1994)', 'Fugitive, The (1993)'), 'confidence': 0.4085714285714286, 'support': 0.2037037037037037, 'lift': 1.6113322632423754, 'conviction': 1.2620944966754986}]


In [63]:
rules_df = pd.DataFrame(rules_dict)
rules_df.head()

Unnamed: 0,rule_lhs,rule_rhs,confidence,support,lift,conviction
0,"(Pulp Fiction (1994),)","(Silence of the Lambs, The (1991), Usual Suspects, The (1995))",0.405714,0.202279,1.814086,1.306364
1,"(Pulp Fiction (1994),)","(Forrest Gump (1994), Fugitive, The (1993))",0.408571,0.203704,1.611332,1.262094
2,"(Pulp Fiction (1994),)","(Forrest Gump (1994), Terminator 2: Judgment Day (1991))",0.408571,0.203704,1.470857,1.221149
3,"(Pulp Fiction (1994),)","(Toy Story (1995),)",0.411429,0.205128,1.198435,1.115744
4,"(Pulp Fiction (1994),)","(Fugitive, The (1993), Jurassic Park (1993))",0.411429,0.205128,1.586939,1.25854


* Soporte: Nos indica la frecuencia relativa del itemset
* Confianza: Probabilidad empirica de que ocurra el consecuente dado que ocurrio el antecedente
* Lift: Refleja el aumento de la probabilidad de que ocurra el consecuente cuando nos enteramos de que ocurre el antecedente. Resuelve el problema de cuando la confianza es muy parecida a la probabilidad apriori del consecuente (numero de transacciones que contienen el consecuente / numero de transacciones totales).

    * mayor a 1: la probabilidad del consecuente aumenta una vez que sabemos que el consumidor compró el antecedente.
    * igual a 1: antecedente y consecuentes son eventos independientes, hay causalidad.
    * menor a 1: la ocurrencia del antecedente tuvo un efecto negativo en la ocurrencia del consecuente haciedno que baje su probabilidad. 

In [39]:
rules_df.describe()

Unnamed: 0,confidence,support,lift,conviction
count,442.0,442.0,442.0,442.0
mean,0.631733,0.224066,1.741088,2.008935
std,0.129278,0.024554,0.305292,1.175426
min,0.405714,0.200855,1.17,1.114164
25%,0.518186,0.206553,1.510009,1.407496
50%,0.633986,0.2151,1.711948,1.686928
75%,0.710871,0.233618,1.91485,2.089141
max,0.966443,0.346154,3.39352,16.980056


In [62]:
final_rules = rules_df[(rules_df.lift >= 2)]
final_rules.head(7)

Unnamed: 0,rule_lhs,rule_rhs,confidence,support,lift,conviction
49,"(Jurassic Park (1993),)","(Fugitive, The (1993), True Lies (1994))",0.470199,0.202279,2.075972,1.459989
51,"(Jurassic Park (1993),)","(Forrest Gump (1994), Independence Day (a.k.a. ID4) (1996))",0.47351,0.203704,2.103823,1.471877
57,"(Jurassic Park (1993),)","(Speed (1994), Terminator 2: Judgment Day (1991))",0.476821,0.205128,2.132029,1.483916
58,"(Jurassic Park (1993),)","(Terminator 2: Judgment Day (1991), True Lies (1994))",0.476821,0.205128,2.2465,1.505698
77,"(Jurassic Park (1993),)","(Batman (1989), Forrest Gump (1994))",0.486755,0.209402,2.149069,1.507086
78,"(Jurassic Park (1993),)","(Forrest Gump (1994), Speed (1994))",0.486755,0.209402,2.070921,1.490433
101,"(Jurassic Park (1993),)","(Forrest Gump (1994), True Lies (1994))",0.506623,0.217949,2.092053,1.536014
