In [52]:
import pandas as pd
import mysql.connector as mysql

def connect_to_mysql(host, user, password, database):
    """
    Establishes a connection to the MySQL database.
    """
    try:
        connection = mysql.connect(
            host=host,
            user=user,
            password=password,
            database=database
        )
        print("Connection to MySQL established successfully.")
        return connection
    except mysql.Error as e:
        print("Error connecting to MySQL:", e)
        raise

def populate_review_table(csv_file, connection):
    """ 
    Reads a CSV file and inserts data into the MySQL 'review' table.
    """
    # Load CSV into a Pandas DataFrame
    data = pd.read_csv(csv_file)

    #Replace NaN values with None
    data = data.where(pd.notnull(data), None)
    data.replace("nan", None, inplace=True)

    #Prepare the MySQL queries
    insert_review_query = """ 
    INSERT INTO review (account_id, media_id, rating)
    VALUES (%s, %s, %s)
    """

    insert_account_query = """ 
    INSERT INTO account (total_reviews)
    VALUES (%s)
    """

    try:
        with connection.cursor(dictionary=True) as cursor:

            #Initialize iterator for debugging
            i=0

            for _, row in data.iterrows():
                

                # Insert new record into account table if new userID

                # Check if the account already exists
                cursor.execute("SELECT id FROM account WHERE id = %s", (row['userId'],))
                result = cursor.fetchone()  # Ensure we fetch the result completely

                if result:
                    # If account exists, fetch its ID
                    userID = result['id']
                else:
                    # If account doesn't exist, insert it and fetch the ID
                    cursor.execute(insert_account_query, (0,))
                    connection.commit()  # Commit to ensure the insertion is completed
                    print("UserID inserted!")
                    cursor.execute("SELECT id FROM account WHERE id = %s", (row['userId'],))
                    result = cursor.fetchone()
                    if result:
                        userID = result['id']

                # Find the appropriate media ID
                temp = movie_data[movie_data['id'] == row["movieId"]]

                if not temp.empty:
                    title = temp['title'].values[0]  # Extract the title as a string
                    cursor.execute("SELECT id FROM media WHERE name = %s", (title,))
                    result = cursor.fetchone()  # Fully fetch the result
                    if result:
                        movieID = result['id']
                    else:
                        print(f"Media not found for title: {title}")
                        #continue  # Skip this row if no matching media ID is found
                else:
                    print(f"Movie ID {row['movieId']} not found in movie_data")
                    #continue  # Skip this row if no matching movie ID is found

                # Insert into the review table
                print("Attempting to insert: " + str(row['userId']) + ", " + str(row['movieId']) + ", " + str(row['rating']))
                cursor.fetchall()
                cursor.execute(insert_review_query, (
                    userID,
                    movieID,
                    row['rating']
                ))
                
                print("Row " + str(i) + " added successfully.")
                i+=1
        
            connection.commit()
    except Exception as e:
        connection.rollback()
        print("Error inserting data:", e)
        print("Problematic row: " + str(row))
        raise       

def populate_media_table(csv_file, connection):
    """
    Reads a CSV file and inserts data into the MySQL `media` table.
    """
    # Load CSV into a Pandas DataFrame
    data = pd.read_csv(csv_file)

    # Replace null values in 'title' with 'original_title'
    data['title'] = data['title'].fillna(data['original_title'])

    # Replace NaN values with None
    data = data.where(pd.notnull(data), None)
    data.replace("nan", None, inplace=True)
    
    #Serial counter for the insertion
    serial = 0
    # Prepare the MySQL queries
    insert_media_query = """
    INSERT INTO media (type, genre, genre2, genre3, date_released, studio, name, full_average, total_reviews)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    insert_genre_query = """
    INSERT IGNORE INTO genre (name) VALUES (%s)
    """

    insert_type_query = """
    INSERT IGNORE INTO type (name) VALUES (%s)
    """

    try:
        with connection.cursor(dictionary=True) as cursor:
            for _, row in data.iterrows():
                # Process the genres (assuming the `genres` field is JSON-like in the CSV)
                genres = eval(row['genres']) if row['genres'] != "[]" else []  # Safely handle empty genres
                # Initialize genre IDs and names(maximum 3)
                genre_ids = [None, None, None]
                genre_names = [None, None, None]

                if genres:
                    # Add genres to the genre table and retrieve their IDs
                    for i, genre in enumerate(genres[:3]):  # Limit to 3 genres
                        genre_name = genre['name']

                        # Check if the genre already exists
                        cursor.execute("SELECT id FROM genre WHERE name = %s", (genre_name,))
                        result = cursor.fetchone()

                        if result:
                            # If genre exists, fetch its ID and name
                            genre_ids[i] = result['id']
                            genre_names[i] = genre_name
                        else:
                            # If genre doesn't exist, insert it and fetch the ID
                            cursor.execute(insert_genre_query, (genre_name,))
                            connection.commit()  # Commit to ensure the insertion is completed
                            cursor.execute("SELECT id FROM genre WHERE name = %s", (genre_name,))
                            result = cursor.fetchone()
                            if result:
                                genre_ids[i] = result['id']
                                genre_names[i] = genre_name

                    #Add type to the type table and retrieve its ID
                    if genre_names[0] == "TV Movie":
                        type_name = "Television"
                    elif genre_names[0] == "Documentary":
                        type_name = "Documentary"
                    else:
                        type_name = "Movie"
                     # Check if the type already exists
                    cursor.execute("SELECT id FROM type WHERE name = %s", (type_name,))
                    result = cursor.fetchone()

                    if result:
                        # If type exists, fetch its ID
                        type_id = result['id']
                    else:
                        # If genre doesn't exist, insert it and fetch the ID
                        cursor.execute(insert_type_query, (type_name,))
                        connection.commit()  # Commit to ensure the insertion is completed
                        cursor.execute("SELECT id FROM type WHERE name = %s", (type_name,))
                        result = cursor.fetchone()
                        if result:
                            type_id = result['id']
                    
                # Process the producers (assuming the `producers` field is JSON-like in the CSV)
                studios = eval(row['production_companies']) if row['production_companies'] != "[]" else []  # Safely handle empty studios
                
                if studios:
                    # Obtain first studio name
                    first_studio_name = studios[0]['name']

                # Insert into the media table
                cursor.execute(insert_media_query, (
                    type_id,
                    genre_ids[0],  # Use the genre's ID or None if no genres are available
                    genre_ids[1],
                    genre_ids[2],
                    row['release_date'],
                    first_studio_name,  #Use the first studio's name only
                    row['title'],
                    row['vote_average'],
                    row['vote_count']
                ))
                
                print("Row " + str(serial) + " inserted.")
                serial +=1
        # Commit the transaction
        connection.commit()
        print("Data successfully inserted into MySQL.")
    except Exception as e:
        connection.rollback()
        print("Error inserting data:", e)
        print("Problematic row: " + str(row))
        raise

def main():
    # MySQL connection parameters
    host = "localhost"
    user = "root"
    password = "Mckennasmith080419"
    database = "cp317_schema"

    # Path to your CSV files
    movie_file = r"C:\Users\zachr\OneDrive\Desktop\CP317 - Software Engineering\movies_data.csv"
    review_file = r"C:\Users\zachr\OneDrive\Desktop\CP317 - Software Engineering\Data\ratings.csv"

    
    

    # Connect to MySQL and insert data
    connection = connect_to_mysql(host, user, password, database)
    try:
        #populate_media_table(movie_file, connection)
        populate_review_table(review_file, connection)
    finally:
        connection.close()
    
    

movie_data = pd.read_csv(r"C:\Users\zachr\OneDrive\Desktop\CP317 - Software Engineering\movies_data.csv")

if __name__ == "__main__":
    main()


Connection to MySQL established successfully.
Attempting to insert: 1.0, 110.0, 1.0
Row 0 added successfully.
Attempting to insert: 1.0, 147.0, 4.5
Row 1 added successfully.
Attempting to insert: 1.0, 858.0, 5.0
Row 2 added successfully.
Movie ID 1221.0 not found in movie_data
Attempting to insert: 1.0, 1221.0, 5.0
Error inserting data: 1062 (23000): Duplicate entry '1-535' for key 'review.PRIMARY'
Problematic row: userId       1.000000e+00
movieId      1.221000e+03
rating       5.000000e+00
timestamp    1.425942e+09
Name: 3, dtype: float64


IntegrityError: 1062 (23000): Duplicate entry '1-535' for key 'review.PRIMARY'