In [1]:
import os
import pandas as pd
import numpy as np
import random as rd

In [6]:
!unzip -o archive_netflix.zip -d archive_netflix

Archive:  archive_netflix.zip
  inflating: archive_netflix/README  
  inflating: archive_netflix/combined_data_1.txt  
  inflating: archive_netflix/combined_data_2.txt  
  inflating: archive_netflix/combined_data_3.txt  
  inflating: archive_netflix/combined_data_4.txt  
  inflating: archive_netflix/movie_titles.csv  
  inflating: archive_netflix/probe.txt  
  inflating: archive_netflix/qualifying.txt  


In [3]:


# Directory containing the .txt files
input_directory = "archive_netflix"

# Function to process each file
def process_file(input_file, output_dict):
    with open(input_file, 'r') as file:
        current_movie_id = None
        for line in file:
            try:
                if line.endswith(':\n'):
                    current_movie_id = line.strip()[:-1]
                else:
                    user_id, rating, _ = line.split(',')
                    if user_id not in output_dict:
                        output_dict[user_id] = []
                    output_dict[user_id].append(f"{current_movie_id},{rating.strip()}")
            except Exception as e:
                print(f"Error processing line: {line.strip()} in file {input_file}")
                print(e)

# Aggregate data from all files
output_dict = {}

for filename in os.listdir(input_directory):
    if filename.endswith('.txt') and filename != "combined_data_4.txt" and filename != "probe.txt" and filename != "qualifying.txt": # solo utilizamos dataset 1, 2 y 3
        print(f"Processing file: {filename}")
        process_file(os.path.join(input_directory, filename), output_dict)
        print(f"Finished processing file: {filename}")


Processing file: netflix_reformatted_data.txt
Finished processing file: netflix_reformatted_data.txt
Processing file: combined_data_1.txt
Finished processing file: combined_data_1.txt
Processing file: combined_data_3.txt
Finished processing file: combined_data_3.txt
Processing file: combined_data_2.txt
Finished processing file: combined_data_2.txt


In [4]:
output_file = 'archive_netflix/netflix_reformatted_data.txt'
# Write the reformatted data to a new file
with open(output_file, 'w') as file:
    for user_id, movies in output_dict.items():
        file.write(f"{user_id}:\n")
        for movie in movies:
            file.write(f"{movie}\n")

print(f"Data has been reformatted and saved to {output_file}")

Data has been reformatted and saved to archive_netflix/netflix_reformatted_data.txt


In [2]:
reformatted_data_path = "netflix_reformatted_data.txt"

reformatted_data = {}

#write new file with format user_id, movie_id, rating
with open(reformatted_data_path, 'r') as file:
    for line in file:
        if line.endswith(':\n'):
            user_id = line.strip()[:-1]
            reformatted_data[user_id] = []
        else:
            movie_id, rating = line.split(',')
            reformatted_data[user_id].append((movie_id, rating.strip()))



#order the data by number of ratings per user
sorted_data = sorted(reformatted_data.items(), key=lambda x: len(x[1]), reverse=True)
print(sorted_data[0:10])

[('305344', [('1', '1'), ('2', '1'), ('3', '2'), ('4', '1'), ('5', '1'), ('6', '1'), ('7', '2'), ('8', '1'), ('9', '1'), ('10', '1'), ('11', '1'), ('12', '4'), ('14', '1'), ('15', '5'), ('16', '1'), ('17', '2'), ('18', '2'), ('19', '1'), ('20', '1'), ('21', '1'), ('22', '1'), ('23', '1'), ('24', '1'), ('25', '1'), ('26', '1'), ('27', '1'), ('28', '3'), ('29', '3'), ('30', '1'), ('31', '4'), ('32', '1'), ('33', '1'), ('34', '1'), ('35', '3'), ('36', '1'), ('37', '1'), ('38', '1'), ('39', '1'), ('40', '1'), ('41', '1'), ('42', '1'), ('43', '1'), ('44', '1'), ('45', '3'), ('46', '1'), ('47', '2'), ('48', '2'), ('49', '3'), ('50', '2'), ('52', '1'), ('53', '1'), ('54', '1'), ('55', '3'), ('56', '1'), ('57', '2'), ('58', '2'), ('59', '1'), ('60', '3'), ('61', '3'), ('62', '2'), ('63', '1'), ('64', '1'), ('65', '1'), ('66', '3'), ('67', '1'), ('68', '1'), ('69', '1'), ('70', '2'), ('71', '2'), ('72', '4'), ('73', '1'), ('74', '1'), ('75', '1'), ('76', '1'), ('77', '2'), ('78', '4'), ('79', '

In [11]:
#creamos carpeta netflix
!mkdir -p netflix

In [12]:
#write the data to a new file with the format user_id, movie_id, rating
output_file = 'netflix/netflix.txt'
with open(output_file, 'w') as file:
    for user_id, movies in reformatted_data.items():
        for movie in movies:
            file.write(f"{user_id},{movie[0]},{movie[1]}\n")


### Calcular Popularidad de ítems

In [5]:
def calculate_item_popularity(df2):
    df = df2.copy()
    # Calcular la cantidad total de usuarios
    total_users = df['user_id'].nunique()

    # Contar la cantidad de usuarios únicos que compraron cada item
    item_user_count = df.groupby('item_id')['user_id'].nunique().reset_index()

    # Renombrar la columna user_id a popularity
    item_user_count = item_user_count.rename(columns={'user_id': 'popularity'})

    # Calcular la popularidad dividiendo por la cantidad total de usuarios
    item_user_count['popularity'] = item_user_count['popularity'] / total_users

    return item_user_count

### Calcular ítems más populares (top 20%)

In [6]:
def get_top_20_percent_items(popularity_df):
    # Ordenar los items por popularidad de mayor a menor
    sorted_popularity_df = popularity_df.sort_values(by='popularity', ascending=False)
    # Calcular el número de ítems que corresponde al 20%
    top_20_percent_count = int(len(sorted_popularity_df) * 0.20)
    # Obtener los ítems más populares que corresponden al 20%
    I_pop = sorted_popularity_df.head(top_20_percent_count)['item_id'].tolist()
    return I_pop

### Calcular popularidad de users

In [7]:
def calculate_user_popularity(df2, I_pop):
    df = df2.copy()
    # Filtrar los items rateados que están en I_pop
    df['is_popular'] = df['item_id'].isin(I_pop)

    # Calcular la cantidad de items populares rateados por cada usuario
    user_popular_items_count = df[df['is_popular']].groupby('user_id')['item_id'].count().reset_index()
    user_popular_items_count = user_popular_items_count.rename(columns={'item_id': 'popular_items_count'})

    # Calcular la cantidad total de items rateados por cada usuario
    user_total_items_count = df.groupby('user_id')['item_id'].count().reset_index()
    user_total_items_count = user_total_items_count.rename(columns={'item_id': 'total_items_count'})

    # Asegurar que todos los usuarios están presentes en el resultado final
    user_popular_items_count = pd.merge(user_total_items_count[['user_id']],
                                        user_popular_items_count,
                                        on='user_id',
                                        how='left').fillna(0)

    # Combinar los dos DataFrames
    user_popularity_df = pd.merge(user_popular_items_count, user_total_items_count, on='user_id')

    # Calcular user_pop dividiendo popular_items_count por total_items_count
    user_popularity_df['user_pop'] = user_popularity_df['popular_items_count'] / user_popularity_df['total_items_count']

    # Seleccionar solo las columnas user_id y user_pop
    user_popularity_df = user_popularity_df[['user_id', 'user_pop']]

    return user_popularity_df

Seleccionamos aleatoreamente 3000 usuarios y los dividimos en top, mid and bottom

In [14]:
# Cargar el dataset de Netflix
df_netflix = pd.read_csv('netflix/netflix.txt', sep=",", header=None, names=['user_id', 'item_id', 'rating'])

# Seleccionar las columnas relevantes
df_netflix = df_netflix[['user_id', 'item_id', 'rating']]
df_netflix['rating'] = df_netflix['rating'].astype(float)

#take only 3000 users

users = df_netflix['user_id'].unique()
rd.shuffle(users)
users = users[:3000]
df_netflix = df_netflix[df_netflix['user_id'].isin(users)]

# Calcular la popularidad de los ítems
popularity_df = calculate_item_popularity(df_netflix)

# Obtener los ítems del top 20% de popularidad
I_pop = get_top_20_percent_items(popularity_df)

# Calcular la popularidad de los usuarios
user_popularity_df = calculate_user_popularity(df_netflix, I_pop)
user_popularity_df = user_popularity_df.sort_values(by='user_pop', ascending=False)

# Separar usuarios en top, medium y bottom popularity
top, mid, bot = np.array_split(user_popularity_df, 3)

# Mostrar resultados
print("Top 10 usuarios de popularidad alta:\n", top.head(10))
print("Top 10 usuarios de popularidad media:\n", mid.head(10))
print("Top 10 usuarios de popularidad baja:\n", bot.head(10))

Top 10 usuarios de popularidad alta:
       user_id  user_pop
2990  2636931       1.0
0         248       1.0
2984  2632909       1.0
2939  2588429       1.0
2941  2590437       1.0
2942  2591426       1.0
2945  2594211       1.0
2932  2584937       1.0
2937  2587885       1.0
2923  2575096       1.0
Top 10 usuarios de popularidad media:
       user_id  user_pop
18      16891  0.963964
1016   888578  0.963636
592    502450  0.963636
670    561508  0.963636
151    130034  0.963470
438    378243  0.963455
1935  1713572  0.963415
2665  2343872  0.963351
1709  1501090  0.963190
2798  2465214  0.962963
Top 10 usuarios de popularidad baja:
       user_id  user_pop
1235  1101357  0.884993
1666  1465417  0.884848
2439  2166014  0.884615
1211  1073371  0.884615
170    145499  0.884615
2964  2612695  0.884615
1180  1050889  0.884615
722    607215  0.884615
2195  1936338  0.884615
375    321341  0.884615


  return bound(*args, **kwds)


In [15]:
top.to_csv('./netflix/top.csv', index=False)
mid.to_csv('./netflix/mid.csv', index=False)
bot.to_csv('./netflix/bot.csv', index=False)
df_netflix.to_csv('./netflix/netflix.csv', index=False)