### **LIBRERÍAS**

In [1]:
import numpy as np
import pandas as pd
import sqlite3 as sql
import plotly.graph_objs as go ### para gráficos
import plotly.express as px
#from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt
import a_funciones as fn

### **CARGAR LOS DATOS**

In [2]:
# Open the SQLite database (sqlite3 creates the file when it does not exist yet).
# A forward slash is used because it is accepted on Windows, Linux and macOS; the
# previous 'data\\db_movies' literal only pointed inside the data/ folder on Windows.
conn = sql.connect('data/db_movies')
# Cursor used by the helpers that execute raw SQL against the database.
cur = conn.cursor()

In [3]:
# List the tables currently stored in the database.
# 'table' is a single-quoted SQL string literal: in SQLite double quotes denote
# identifiers, and a double-quoted literal is only accepted through a legacy fallback.
cur.execute("select name from sqlite_master where type = 'table'")
cur.fetchall()

[('ratings',),
 ('movies',),
 ('view_mov',),
 ('merge_ratings',),
 ('user_sel',),
 ('movies_sel',),
 ('ratings_filtered',),
 ('movies_final',),
 ('ratings_final',)]

**Nombrar las tablas**

In [4]:
# Read both source tables in full into DataFrames (movies first, then ratings).
movies, ratings = (
    pd.read_sql(statement, conn)
    for statement in ("SELECT * from movies", "SELECT * from ratings")
)

In [5]:
# Display the movies DataFrame as read from the database.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


Se tiene una primera tabla, *movies*, que contiene información sobre las películas: el título (que incluye el año de lanzamiento entre paréntesis) y los géneros asociados a cada una.

In [6]:
# Display the ratings DataFrame as read from the database.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


Se tiene también una segunda tabla, llamada *ratings*, que contiene una fila por cada calificación: el usuario que calificó (*userId*), la película calificada (*movieId*), la calificación otorgada (*rating*) y el momento en que se registró (*timestamp*).

**Separar el año en una nueva columna**

In [7]:
# Split "Title (YYYY)" into separate title and year columns.
# The year is taken only from a trailing "(YYYY)" group. Cutting at the FIRST '('
# (the previous approach) broke titles that contain other parentheses, e.g.
# "Seven (a.k.a. Se7en) (1995)", and emptied titles that have no parentheses.
# Rows whose title does not end in "(YYYY)" keep the full title and get a NULL year.
query = """
SELECT
    movieId,
    CASE
        WHEN TRIM(title) GLOB '*([0-9][0-9][0-9][0-9])'
            THEN TRIM(SUBSTR(TRIM(title), 1, LENGTH(TRIM(title)) - 6))  -- drop the trailing "(YYYY)"
        ELSE TRIM(title)
    END AS title,
    CASE
        WHEN TRIM(title) GLOB '*([0-9][0-9][0-9][0-9])'
            THEN SUBSTR(TRIM(title), LENGTH(TRIM(title)) - 4, 4)        -- the four digits inside the parentheses
    END AS year,
    genres
FROM
    movies;
"""

movies = pd.read_sql(query, conn)

In [8]:
# Display movies after the title/year split.
movies

Unnamed: 0,movieId,title,year,genres
0,1,Toy Story,1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,1995,Adventure|Children|Fantasy
2,3,Grumpier Old Men,1995,Comedy|Romance
3,4,Waiting to Exhale,1995,Comedy|Drama|Romance
4,5,Father of the Bride Part II,1995,Comedy
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,2017,Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero,2017,Animation|Comedy|Fantasy
9739,193585,Flint,2017,Drama
9740,193587,Bungo Stray Dogs: Dead Apple,2018,Action|Animation


In [9]:
# Disabled experiment kept as a bare string literal, so nothing inside it runs.
# It one-hot encodes the genres with TransactionEncoder, whose import is also
# commented out at the top of the notebook.
'''
genres=movies["genres"].str.split("|")
te = TransactionEncoder()
genres = te.fit_transform(genres)
genres = pd.DataFrame(genres, columns = te.columns_)
len(movies["genres"].unique())
'''

'\ngenres=movies["genres"].str.split("|")\nte = TransactionEncoder()\ngenres = te.fit_transform(genres)\ngenres = pd.DataFrame(genres, columns = te.columns_)\nlen(movies["genres"].unique())\n'

**Codificar la columna de géneros como variables dummy, separando los géneros contenidos en cada valor.** 
Esta dummización se hace con el fin de poder analizar más fácilmente la base de datos

In [10]:
# One 0/1 indicator column per genre, obtained by splitting the genres text on '|'.
genres_dummies = movies.genres.str.get_dummies('|')

# Place the indicator columns to the right of the original movie columns.
movies_sep = pd.concat(objs=(movies, genres_dummies), axis='columns')

In [11]:
# Display the movies table with the genre indicator columns appended.
movies_sep

Unnamed: 0,movieId,title,year,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,Adventure|Children|Fantasy,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,Comedy|Romance,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,Comedy|Drama|Romance,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,Comedy,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,2017,Action|Animation|Comedy|Fantasy,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
9738,193583,No Game No Life: Zero,2017,Animation|Comedy|Fantasy,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
9739,193585,Flint,2017,Drama,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple,2018,Action|Animation,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### **VISUALIZACIÓN DE LOS DATOS**

**Calificaciones generales**

In [12]:
# Distribution of rating values: one row per distinct rating with its number of
# occurrences (conteo) and its share of all ratings in percent (porcentaje).
# The share is computed in SQL by dividing each group count by the sum of the
# group counts, taken with a window function over the whole result set.
cr = pd.read_sql("""
    SELECT 
        "rating" AS rating,
        COUNT(*) AS conteo,
        (COUNT(*) * 100.0 / SUM(COUNT(*)) OVER ()) AS porcentaje
    FROM ratings
    GROUP BY "rating"
    ORDER BY "rating"
""", conn)

cr

Unnamed: 0,rating,conteo,porcentaje
0,0.5,1370,1.358642
1,1.0,2811,2.787695
2,1.5,1791,1.776151
3,2.0,7551,7.488397
4,2.5,5550,5.503987
5,3.0,20047,19.880797
6,3.5,13136,13.027093
7,4.0,26818,26.59566
8,4.5,8551,8.480106
9,5.0,13211,13.101472


In [13]:
def _rating_color(rating):
    """Return the bar colour for a rating value.

    Bands are defined by upper bounds so that every value maps to exactly one
    colour: below 1 -> dark teal, below 2.5 -> red, below 4 -> yellow,
    otherwise blue. For the half-star values 0.5..5.0 this matches the original
    banding (0.5 | 1-2 | 2.5-3.5 | 4-5), and no value is left without a colour,
    which would have misaligned the colour list with the bars.
    """
    if rating < 1:
        return '#264653'
    if rating < 2.5:
        return '#fe4a49'
    if rating < 4:
        return '#fed766'
    return '#009fb7'


# One colour per bar, in the same order as the rows of cr.
colors = [_rating_color(rating) for rating in cr['rating']]

# Bars: rating value on the x axis, number of ratings on the y axis, count shown above each bar.
data = go.Bar(x=cr.rating, y=cr.conteo, text=cr.conteo, textposition="outside", marker_color=colors)

layout = go.Layout(
    title={
        'text': "Conteo de Calificaciones",
        'y': 0.94,
        'x': 0.5,
        'xanchor': 'center',  # anchor the title at its centre
        'yanchor': 'top'
    },
    xaxis={
        'title': 'Calificación',
        'tickvals': cr['rating']  # force a tick for every rating value
    },
    yaxis={'title': 'Cantidad'},
    width=800,   # figure width in pixels
    height=600   # figure height in pixels
)

# Build the figure and render it.
fig = go.Figure(data=data, layout=layout)
fig.show()

In [14]:
# Display the ratings DataFrame again for reference.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


**Calificaciones por usuario**

In [15]:
# Number of ratings submitted by each user (cnt_rat), sorted from the least
# active user to the most active one.
rating_users=pd.read_sql(''' SELECT "userId" as user_id,
                         count(*) as cnt_rat
                         FROM ratings
                         group by "userId"
                         order by cnt_rat asc
                         ''',conn)

rating_users

Unnamed: 0,user_id,cnt_rat
0,53,20
1,147,20
2,189,20
3,194,20
4,207,20
...,...,...
605,274,1346
606,448,1864
607,474,2108
608,599,2478


In [16]:
# Project helper from a_funciones; presumably draws a 20-bin histogram of the
# per-user rating counts — TODO confirm against the helper's implementation.
fn.plot_histogram(rating_users, 'cnt_rat', bins=20, color='#264653')

*Según un nuevo estudio, una persona promedio pasa más de 78.000 horas frente al televisor a lo largo de su vida. Los investigadores determinaron que los espectadores ven un promedio de 3.639 películas y 31.507 episodios de televisión, lo que equivale a unas asombrosas 78.705 horas de televisión.*

Filtro para ver los usuarios que han calificado al menos 20 películas y como máximo 1000

In [17]:
# Same per-user rating count as rating_users, restricted with HAVING to users
# whose count lies between 20 and 1000 (both inclusive), sorted ascending.
rating_users2=pd.read_sql(''' select "userId" as user_id,
                         count(*) as cnt_rat
                         FROM ratings
                         group by "userId"
                         having cnt_rat >=20 and cnt_rat <=1000
                         order by cnt_rat asc
                         ''',conn )

rating_users2

Unnamed: 0,user_id,cnt_rat
0,53,20
1,147,20
2,189,20
3,194,20
4,207,20
...,...,...
593,177,904
594,298,939
595,603,943
596,307,975


In [18]:
# Project helper from a_funciones; presumably draws a 20-bin histogram of the
# filtered per-user rating counts — TODO confirm against the helper's implementation.
fn.plot_histogram(rating_users2, 'cnt_rat', bins=20, color='#264653')

In [19]:
# Number of ratings received by each movie (cnt_rat), most-rated movies first.
rating_movies=pd.read_sql(''' select movieId ,
                         count(*) as cnt_rat
                         from ratings
                         group by "movieId"
                         order by cnt_rat desc
                         ''',conn )

rating_movies

Unnamed: 0,movieId,cnt_rat
0,356,329
1,318,317
2,296,307
3,593,279
4,2571,278
...,...,...
9719,96,1
9720,83,1
9721,77,1
9722,55,1


In [20]:
# Display the ratings DataFrame again for reference.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [21]:
# Movies that received exactly one rating.
rating_movies.loc[rating_movies['cnt_rat'].eq(1)]

Unnamed: 0,movieId,cnt_rat
6278,193609,1
6279,193587,1
6280,193585,1
6281,193583,1
6282,193581,1
...,...,...
9719,96,1
9720,83,1
9721,77,1
9722,55,1


In [22]:
# Project helper from a_funciones; presumably draws a 20-bin histogram of the
# per-movie rating counts — TODO confirm against the helper's implementation.
fn.plot_histogram(rating_movies, 'cnt_rat', bins=20, color='#264653')

In [23]:
# Number of ratings received by each movie, keeping only movies with at least
# 20 ratings (HAVING filter), most-rated movies first.
rating_movies2=pd.read_sql(''' select movieId ,
                         count(*) as cnt_rat
                         from ratings
                         group by "movieId"
                         having cnt_rat >= 20
                         order by cnt_rat desc
                         ''',conn )

rating_movies2

Unnamed: 0,movieId,cnt_rat
0,356,329
1,318,317
2,296,307
3,593,279
4,2571,278
...,...,...
1292,319,20
1293,308,20
1294,175,20
1295,69,20


In [24]:
# Project helper from a_funciones; presumably draws a 20-bin histogram of the
# filtered per-movie rating counts — TODO confirm against the helper's implementation.
fn.plot_histogram(rating_movies2, 'cnt_rat', bins=20, color='#264653')

In [25]:
# Every rating value from 0.5 to 5.0 in steps of 0.5.
ratings_list = np.arange(0.5, 5.5, 0.5)

# userId x rating table holding how many times each user gave each rating value.
# Columns are reindexed so every value in ratings_list is present (0 when unused),
# a per-user Total is appended, and users are ordered from most to least ratings.
rating_counts = (
    ratings.groupby(['userId', 'rating'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=ratings_list, fill_value=0)
    .assign(Total=lambda counts: counts.sum(axis=1))
    .sort_values(by='Total', ascending=False)
)

In [26]:
# Display the per-user count of each rating value together with the Total column.
rating_counts

rating,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,Total
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
414,1,40,20,398,122,658,232,903,76,248,2698
599,42,67,226,349,690,624,290,122,36,32,2478
474,9,36,37,173,103,383,580,569,159,59,2108
448,26,85,164,334,180,482,138,337,43,75,1864
274,2,15,37,118,139,246,472,243,63,11,1346
...,...,...,...,...,...,...,...,...,...,...,...
442,6,5,3,4,2,0,0,0,0,0,20
569,0,0,0,0,0,5,0,10,0,5,20
320,1,0,0,0,0,2,8,9,0,0,20
576,0,3,2,1,2,2,1,4,3,2,20


In [27]:
# Crear la nueva matriz binaria
binary_matrix = (rating_counts.iloc[:, :-1] >= 1).astype(int)

# Calcular la columna Total en la matriz binaria
binary_matrix['Total'] = binary_matrix.sum(axis=1)

# Mostrar la nueva matriz con Total
binary_matrix

rating,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,Total
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
414,1,1,1,1,1,1,1,1,1,1,10
599,1,1,1,1,1,1,1,1,1,1,10
474,1,1,1,1,1,1,1,1,1,1,10
448,1,1,1,1,1,1,1,1,1,1,10
274,1,1,1,1,1,1,1,1,1,1,10
...,...,...,...,...,...,...,...,...,...,...,...
442,1,1,1,1,1,0,0,0,0,0,5
569,0,0,0,0,0,1,0,1,0,1,3
320,1,0,0,0,0,1,1,1,0,0,4
576,0,1,1,1,1,1,1,1,1,1,9


In [28]:
# User ids taken from the index of binary_matrix, as a plain list.
id_binary = binary_matrix.index.tolist()

In [29]:
# Values of the Total column of binary_matrix, as a plain list in row order.
binary_total = binary_matrix['Total'].tolist()

In [30]:
# Unión de ID con Total
df_binary = pd.DataFrame()
df_binary['userId'] = id_binary
df_binary['Total_ranges'] = binary_total
df_binary

Unnamed: 0,userId,Total_ranges
0,414,10
1,599,10
2,474,10
3,448,10
4,274,10
...,...,...
605,442,5
606,569,3
607,320,4
608,576,9


In [31]:
# Attach Total_ranges from df_binary to the per-user rating counts, matching on userId.
df_merged_ratings = rating_counts.merge(df_binary, how='outer', on='userId')
# Displayed sorted by Total (descending); the sorted frame is not stored.
df_merged_ratings.sort_values('Total', ascending=False)

Unnamed: 0,userId,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,Total,Total_ranges
413,414,1,40,20,398,122,658,232,903,76,248,2698,10
598,599,42,67,226,349,690,624,290,122,36,32,2478,10
473,474,9,36,37,173,103,383,580,569,159,59,2108,10
447,448,26,85,164,334,180,482,138,337,43,75,1864,10
273,274,2,15,37,118,139,246,472,243,63,11,1346,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
441,442,6,5,3,4,2,0,0,0,0,0,20,5
568,569,0,0,0,0,0,5,0,10,0,5,20,3
319,320,1,0,0,0,0,2,8,9,0,0,20,4
575,576,0,3,2,1,2,2,1,4,3,2,20,9


In [32]:
# How many users share each Total value of the binary matrix, ordered by Total.
total_counts = binary_matrix['Total'].value_counts().sort_index()

# Bar chart: Total value on the x axis, number of users on the y axis.
fig = go.Figure(
    data=[
        go.Bar(
            x=total_counts.index,
            y=total_counts.values,
            marker={'color': '#264653'},
        )
    ]
)

# Titles and figure size.
chart_layout = {
    'title': 'Cantidad de Usuarios por Total de Calificaciones',
    'xaxis_title': 'Total de Calificaciones',
    'yaxis_title': 'Número de Usuarios',
    'width': 800,
    'height': 600,
}
fig.update_layout(**chart_layout)

# Render the chart.
fig.show()

In [33]:
# Right-inclusive bin edges: (0, 2], (2, 4], (4, 6], (6, 11].
bins = [0, 2, 4, 6, 11]
labels = ['1-2', '3-4', '5-6', '+6']  # one label per bin

# Assign every Total value (the index of total_counts) to its bin.
total_binned = pd.cut(total_counts.index, bins=bins, labels=labels, right=True)

# Sum the number of users inside each bin. observed=False is passed explicitly:
# grouping by a categorical without it relies on a deprecated default and emits a
# FutureWarning on recent pandas. The reindex still guarantees every label is
# present (0 when no user falls in the bin) and fixes the bar order.
binned_counts = (
    total_counts.groupby(total_binned, observed=False)
    .sum()
    .reindex(labels, fill_value=0)
)

# Bar chart: bin label on the x axis, number of users on the y axis.
fig = go.Figure()

fig.add_trace(go.Bar(
    x=binned_counts.index,
    y=binned_counts.values,
    marker=dict(color='#264653')
))

# Titles and figure size.
fig.update_layout(
    title='Cantidad de Usuarios por Total de Calificaciones (Binned)',
    xaxis_title='Total de Calificaciones (bins)',
    yaxis_title='Número de Usuarios',
    width=800,
    height=600,
)

# Render the chart.
fig.show()





## **Preprocesamiento**

In [34]:
# Project helper from a_funciones, called with the SQL file name and the cursor.
# NOTE(review): presumably executes preprocesamiento.sql and (re)creates the derived
# tables read below (movies_final, ratings_filtered, ratings_final) — confirm
# against the helper and the SQL file.
fn.ejecutar_sql('preprocesamiento.sql', cur)

In [35]:
# List the tables present in the database after the preprocessing step.
cur.execute("select name from sqlite_master where type='table' ")
cur.fetchall()

[('ratings',),
 ('movies',),
 ('view_mov',),
 ('merge_ratings',),
 ('user_sel',),
 ('movies_sel',),
 ('ratings_filtered',),
 ('movies_final',),
 ('ratings_final',)]

In [36]:
# Display the in-memory movies DataFrame (title and year already split).
movies

Unnamed: 0,movieId,title,year,genres
0,1,Toy Story,1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,1995,Adventure|Children|Fantasy
2,3,Grumpier Old Men,1995,Comedy|Romance
3,4,Waiting to Exhale,1995,Comedy|Drama|Romance
4,5,Father of the Bride Part II,1995,Comedy
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,2017,Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero,2017,Animation|Comedy|Fantasy
9739,193585,Flint,2017,Drama
9740,193587,Bungo Stray Dogs: Dead Apple,2018,Action|Animation


In [37]:
# Row count of movies_final. The previous text 'select count(*) movies_final' had no
# FROM clause, so "movies_final" was parsed as a column alias and the table was
# never queried; its result was also discarded. The count is now printed.
n_movies_final = pd.read_sql('select count(*) as n from movies_final', conn)['n'].iloc[0]
print(f'movies_final: {n_movies_final} rows')

# Load the whole table and show it.
movies_final = pd.read_sql('select * from  movies_final', conn)
movies_final

Unnamed: 0,movieId,title,genres,movieId:1,cnt_rat
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,215
1,2,Jumanji (1995),Adventure|Children|Fantasy,2,110
2,3,Grumpier Old Men (1995),Comedy|Romance,3,52
3,5,Father of the Bride Part II (1995),Comedy,5,49
4,6,Heat (1995),Action|Crime|Thriller,6,102
...,...,...,...,...,...
1292,148626,"Big Short, The (2015)",Drama,148626,26
1293,152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy,152081,32
1294,164179,Arrival (2016),Sci-Fi,164179,26
1295,166528,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi,166528,27


In [38]:
# Display the unfiltered ratings DataFrame for comparison with the filtered tables.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [39]:
# Row count of ratings_filtered. The previous text 'select count(*) ratings_filtered'
# had no FROM clause, so "ratings_filtered" was parsed as a column alias and the
# table was never queried; its result was also discarded. The count is now printed.
n_ratings_filtered = pd.read_sql('select count(*) as n from ratings_filtered', conn)['n'].iloc[0]
print(f'ratings_filtered: {n_ratings_filtered} rows')

# Load the whole table and show it.
ratings_filtered = pd.read_sql('select * from  ratings_filtered', conn)
ratings_filtered

Unnamed: 0,user_id,movie_id,movie_rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
67893,610,148626,4.0,1493847175
67894,610,152081,4.0,1493846503
67895,610,164179,5.0,1493845631
67896,610,166528,4.0,1493879365


In [40]:
# Row count of ratings_final. The previous text 'select count(*) ratings_final' had
# no FROM clause, so "ratings_final" was parsed as a column alias and the table was
# never queried; its result was also discarded. The count is now printed.
n_ratings_final = pd.read_sql('select count(*) as n from ratings_final', conn)['n'].iloc[0]
print(f'ratings_final: {n_ratings_final} rows')

# Load the whole table and show it.
df_final = pd.read_sql('select * from  ratings_final', conn)
df_final

Unnamed: 0,user_id,movie_id,movie_rating,timestamp,movieId,title,genres,movieId:1,cnt_rat
0,1,1,4.0,964982703,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,215
1,1,3,4.0,964981247,3,Grumpier Old Men (1995),Comedy|Romance,3,52
2,1,6,4.0,964982224,6,Heat (1995),Action|Crime|Thriller,6,102
3,1,47,5.0,964983815,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,47,203
4,1,50,5.0,964982931,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,50,204
...,...,...,...,...,...,...,...,...,...
67190,610,148626,4.0,1493847175,148626,"Big Short, The (2015)",Drama,148626,26
67191,610,152081,4.0,1493846503,152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy,152081,32
67192,610,164179,5.0,1493845631,164179,Arrival (2016),Sci-Fi,164179,26
67193,610,166528,4.0,1493879365,166528,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi,166528,27


In [41]:
# Remove the two redundant movie-id columns in one step.
# errors='ignore' keeps the cell re-runnable: the previous in-place drops raised
# KeyError on a second execution because the columns were already gone.
df_final = df_final.drop(columns=['movie_id', 'movieId:1'], errors='ignore')

In [45]:
# Split every movie title into its name and release year.
# The year is read only from a trailing "(YYYY)" group; titles that do not end that
# way keep their full name and get a NULL year, instead of being cut at fixed
# offsets. SUBSTR is used (as elsewhere in this notebook) because the SUBSTRING
# alias is only available in newer SQLite versions.
# NOTE(review): this replaces the df_final loaded from ratings_final with a
# two-column frame built from the full movies table — confirm that is intended.
query = """SELECT
    CASE
        WHEN TRIM(title) GLOB '*([0-9][0-9][0-9][0-9])'
            THEN TRIM(SUBSTR(TRIM(title), 1, LENGTH(TRIM(title)) - 6))  -- name without the trailing "(YYYY)"
        ELSE TRIM(title)
    END AS movie_name,
    CASE
        WHEN TRIM(title) GLOB '*([0-9][0-9][0-9][0-9])'
            THEN SUBSTR(TRIM(title), LENGTH(TRIM(title)) - 4, 4)        -- the four digits of the year
    END AS movie_year
FROM movies;"""

df_final = pd.read_sql(query, conn)

In [46]:
# Preview the first rows of df_final (movie_name and movie_year).
df_final.head()

Unnamed: 0,movie_name,movie_year
0,Toy Story,1995
1,Jumanji,1995
2,Grumpier Old Men,1995
3,Waiting to Exhale,1995
4,Father of the Bride Part II,1995
