# Data cleaning and preparation

In [16]:
import sys, os
import pandas as pd

1 step: we cleaned basically duplicates, and zero values from data. Moreover, made some features into int type and some to str type to ensure soothe analysis moving further.

In [17]:
movies_file = os.path.join(os.getcwd(), 'data', 'movie_titles_with_details.csv')

# Load movie titles with error handling and proper quote handling
try:
    movie_titles = pd.read_csv(
        movies_file 
    )
    print(movie_titles.head(20))  # Print the first 20 lines to inspect the data
except FileNotFoundError:
    print(f"File not found: {movies_file}")
    print("Please check the path and ensure the file exists.")
except UnicodeDecodeError as e:
    print(f"Encoding error: {e}")
except pd.errors.ParserError as e:
    print(f"Parser error: {e}")


    movieID    year                                              title   
0   movieID    year                                              title  \
1         1  2003.0                                    Dinosaur Planet   
2         2  2004.0                         Isle of Man TT 2004 Review   
3         3  1997.0                                          Character   
4         4  1994.0                       Paula Abdul's Get Up & Dance   
5         5  2004.0                           The Rise and Fall of ECW   
6         6  1997.0                                               Sick   
7         7  1992.0                                              8 Man   
8         8  2004.0                         What the #$*! Do We Know!?   
9         9  1991.0                           Class of Nuke 'Em High 2   
10       10  2001.0                                            Fighter   
11       11  1999.0                     Full Frame: Documentary Shorts   
12       12  1947.0                   

In [18]:
movie_titles = movie_titles.iloc[1:].reset_index(drop=True)

In [19]:
movie_titles.isnull().value_counts()

movieID  year   title  genres  runtime  original_language  popularity  adult
False    False  False  False   False    False              False       False    3572
                       True    True     True               True        True      861
                               False    False              False       False     151
Name: count, dtype: int64

In [20]:
movie_titles_w_na = movie_titles.dropna()

In [21]:
movie_titles_w_na.isnull().value_counts()

movieID  year   title  genres  runtime  original_language  popularity  adult
False    False  False  False   False    False              False       False    3572
Name: count, dtype: int64

In [22]:
movie_titles_w_na.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3572 entries, 0 to 4583
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movieID            3572 non-null   object 
 1   year               3572 non-null   object 
 2   title              3572 non-null   object 
 3   genres             3572 non-null   object 
 4   runtime            3572 non-null   float64
 5   original_language  3572 non-null   object 
 6   popularity         3572 non-null   float64
 7   adult              3572 non-null   object 
dtypes: float64(2), object(6)
memory usage: 251.2+ KB


In [23]:
movie_titles_w_na['movieID'] = pd.to_numeric(movie_titles_w_na['movieID'], errors='coerce').astype('Int64')
movie_titles_w_na['year'] = pd.to_numeric(movie_titles_w_na['year'], errors='coerce').astype('Int64')
movie_titles_w_na['title'] = movie_titles_w_na['title'].astype(str)
movie_titles_w_na['genres'] = movie_titles_w_na['genres'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_titles_w_na['movieID'] = pd.to_numeric(movie_titles_w_na['movieID'], errors='coerce').astype('Int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_titles_w_na['year'] = pd.to_numeric(movie_titles_w_na['year'], errors='coerce').astype('Int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

In [24]:
movie_titles_w_na.isnull().value_counts()

movieID  year   title  genres  runtime  original_language  popularity  adult
False    False  False  False   False    False              False       False    3572
Name: count, dtype: int64

2 step: we took only 1 part of data related to users information about watched movies and ratings. Limited to 300 million rows due to lack of computer capabilities to handle bigger data set.

In [25]:
data_file = os.path.join(os.getcwd(), 'Data', 'combined_data_1.txt')

movie_ids = []
customer_ids = []
ratings = []
rows_processed = 0
max_rows = 300000000

try:
    with open(data_file, 'r') as file:
        current_movie_id = None
        for line in file:
            if rows_processed >= max_rows:
                break
            
            line = line.strip()
            if line.endswith(':'):
                current_movie_id = line.replace(':', '')
            else:
                if current_movie_id is not None:
                    parts = line.split(',')
                    customer_id = parts[0]
                    rating = parts[1]
                    movie_ids.append(current_movie_id)
                    customer_ids.append(customer_id)
                    ratings.append(rating)
                    rows_processed += 1

    df = pd.DataFrame({
        'movieID': movie_ids,
        'customerID': customer_ids,
        'rating': ratings
    })

    df['movieID'] = df['movieID'].astype(int)
    df['customerID'] = df['customerID'].astype(int)
    df['rating'] = df['rating'].astype(float)

    print(df.head())

except FileNotFoundError:
    print(f"File not found: {data_file}")
    print("Please check the path and ensure the file exists.")
except Exception as e:
    print(f"An error occurred: {e}")

   movieID  customerID  rating
0        1     1488844     3.0
1        1      822109     5.0
2        1      885013     4.0
3        1       30878     4.0
4        1      823519     3.0


In [26]:
merged_df = pd.merge(df, movie_titles_w_na, on='movieID')
merged_df.isnull().value_counts()

movieID  customerID  rating  year   title  genres  runtime  original_language  popularity  adult
False    False       False   False  False  False   False    False              False       False    21784674
Name: count, dtype: int64

3 step: merged users related data with information about movies. Dropped `title` and `adult` feature due to reduction of dataset size and `adult` feature having only FALSE values.

In [30]:
merged_df = merged_df.drop(columns=['title','adult'])

In [31]:
merged_df.to_parquet('Data/Data_for_model.parquet', index=False)