In [1]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
PATH_OF_MOVIES = '/content/drive/MyDrive/movies_samples.csv'

# File paths
file_path = PATH_OF_MOVIES
like_file_path = 'like.txt'
dislike_file_path = 'dislike.txt'
train_file_path = 'train_set.txt'
val_file_path = 'valid_set.txt'
test_file_path = 'test_set.txt'

# Step 1: Process ratings into like/dislike categories
like = []
dislike = []

with open(file_path, 'r', encoding='ISO-8859-1') as f:
    next(f)
    for line in f:
        items = line.strip().split('\t')
        if len(items) < 5:
            continue
        user_id = items[0]
        movie_id = items[4]
        rating = float(items[1])

        new_line = f"{user_id},{movie_id},{rating}\n"
        print(f"New Line: {new_line.strip()}")
        print(f"User: {user_id}, Movie: {movie_id}, Rating: {rating}")
        if rating < 4:
            if rating < 3:
                dislike.append(new_line)
            continue
        like.append(new_line)

print(f"Number of likes: {len(like)}")
print(f"Number of dislikes: {len(dislike)}")
with open(like_file_path, 'w') as f:
    f.writelines(like)

with open(dislike_file_path, 'w') as f:
    f.writelines(dislike)


df_like = pd.read_csv(like_file_path, header=None, names=['u', 'i', 'r'])
sorted_data = df_like.sort_values(by=['u'])  # Sort by user ID

train_set = pd.DataFrame(columns=['u', 'i', 'r'])
val_set = pd.DataFrame(columns=['u', 'i', 'r'])
test_set = pd.DataFrame(columns=['u', 'i', 'r'])

for user_id, group in sorted_data.groupby('u'):
    print(f"User {user_id} - Number of ratings: {len(group)}")
    num_ratings = len(group)
    if num_ratings < 3:
        continue  # Skip users with less than 3 ratings
    test_set = pd.concat([test_set, group.iloc[[num_ratings - 1]]])  # Last interaction for testing
    val_set = pd.concat([val_set, group.iloc[[num_ratings - 2]]])  # Second-to-last for validation
    train_set = pd.concat([train_set, group.iloc[:num_ratings - 2]])  # Rest for training

# Step 4: Save the datasets to files
train_set.reset_index(drop=True, inplace=True)
val_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)


train_set.to_csv(train_file_path, index=False, header=None, sep=' ')
val_set.to_csv(val_file_path, index=False, header=None, sep=' ')
test_set.to_csv(test_file_path, index=False, header=None, sep=' ')

# Print summaries
print("（Training set）:")
print(train_set)
print("\n（Validation set）:")
print(val_set)
print("\n（Test set）:")
print(test_set)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
New Line: A3V91DFBU5OFAI,0790746999,5.0
User: A3V91DFBU5OFAI, Movie: 0790746999, Rating: 5.0
New Line: A3V91DFBU5OFAI,B0000696ID,5.0
User: A3V91DFBU5OFAI, Movie: B0000696ID, Rating: 5.0
New Line: A3V91DFBU5OFAI,B0000696IE,5.0
User: A3V91DFBU5OFAI, Movie: B0000696IE, Rating: 5.0
New Line: A3V91DFBU5OFAI,6304143192,5.0
User: A3V91DFBU5OFAI, Movie: 6304143192, Rating: 5.0
New Line: A3V91DFBU5OFAI,B000069AT7,5.0
User: A3V91DFBU5OFAI, Movie: B000069AT7, Rating: 5.0
New Line: A3V91DFBU5OFAI,B000069AT8,5.0
User: A3V91DFBU5OFAI, Movie: B000069AT8, Rating: 5.0
New Line: A3V91DFBU5OFAI,B00008FEEG,5.0
User: A3V91DFBU5OFAI, Movie: B00008FEEG, Rating: 5.0
New Line: A3V91DFBU5OFAI,B00005BK3M,5.0
User: A3V91DFBU5OFAI, Movie: B00005BK3M, Rating: 5.0
New Line: A3VHI0VC0ZWEDX,B008LCRA98,5.0
User: A3VHI0VC0ZWEDX, Movie: B008LCRA98, Rating: 5.0
New Line: A3VHI0VC0ZWEDX,B000OIOOVO,5.0
User: A3VHI0VC0ZWEDX, Movie: B000OIOOVO, Rating: 5.0
New L

  test_set = pd.concat([test_set, group.iloc[[num_ratings - 1]]])  # Last interaction for testing
  val_set = pd.concat([val_set, group.iloc[[num_ratings - 2]]])  # Second-to-last for validation
  train_set = pd.concat([train_set, group.iloc[:num_ratings - 2]])  # Rest for training


User A11QWW7ZWFSQLP - Number of ratings: 2
User A121XO3OU9CPZ6 - Number of ratings: 6
User A125PQE515J3JZ - Number of ratings: 6
User A12GHTT7AN4N14 - Number of ratings: 9
User A12KM32L9JURF6 - Number of ratings: 3
User A12LKEM543ILBK - Number of ratings: 6
User A12V33MJD5T3GJ - Number of ratings: 4
User A12YVS06GZEFZE - Number of ratings: 9
User A1318DRVUV8X4S - Number of ratings: 10
User A133XVTILE3HA8 - Number of ratings: 8
User A135WQ77OCZID7 - Number of ratings: 7
User A13CHNKYZAULKA - Number of ratings: 10
User A13K8XT134B52I - Number of ratings: 10
User A13WOT3RSXKRD5 - Number of ratings: 10
User A141L1MUXVSZ8H - Number of ratings: 8
User A14EBTROZF3AC7 - Number of ratings: 10
User A14GILA5IVI2Z3 - Number of ratings: 7
User A14HMP62A37RVX - Number of ratings: 9
User A14I4VHJAZ8LC - Number of ratings: 5
User A14IY5HCINZWT0 - Number of ratings: 10
User A14L97QUNT8KZE - Number of ratings: 5
User A14MRJMPYTEF31 - Number of ratings: 9
User A14NA0W8ESGDSI - Number of ratings: 9
User A