# Preparing data for analysis

Before we can start analyzing the data, we need to prepare it. This includes cleaning the data and transforming it so that it is easier to obtain the information we want.

### Importing libraries

In [136]:
from dotenv import load_dotenv
import os
import pandas as pd
import json
from Project.Helpers.database import Database
from openai import OpenAI
from Project.Helpers.Classes import User
import ast

### API setup

In [137]:
# Pull variables from a local .env file into the process environment, then read the key.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # None when the variable is not set
# API clients
llm = OpenAI(
    api_key=OPENAI_API_KEY
)


#API FUNCTIONS
def get_completion_from_messages(messages, model='gpt-4o', temperature=0, max_tokens=4096):
    """Send a chat request in JSON mode and return the reply text.

    Args:
        messages: List of ``{'role': ..., 'content': ...}`` dicts passed to the API as-is.
        model: Name of the chat model to use.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        The ``message.content`` of the first choice in the response.

    Warns:
        RuntimeWarning: If the reply stopped because ``max_tokens`` was reached.
            The text is still returned, but it is then most likely incomplete
            JSON and ``json.loads`` on it will fail.
    """
    import warnings

    response = llm.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        response_format={"type": "json_object"}
    )
    choice = response.choices[0]

    # A reply that hit the token limit is cut off mid-object. Say so here
    # instead of leaving the caller with an unexplained JSON decode error.
    if choice.finish_reason == 'length':
        warnings.warn(
            f"Completion was cut off at max_tokens={max_tokens}; "
            "the returned JSON is probably incomplete.",
            RuntimeWarning,
            stacklevel=2,
        )

    return choice.message.content



### Setting the parameters

In [138]:
# Candidate user ids; this list is not referenced again in this part of the notebook.
user_ids = [2,22, 12, 28, 7]
# The user whose data is loaded, prepared and exported below.
user_id = 2

### Loading the user data 

In [139]:
def initialize_user(user_id):
    """Look up one user in ``../Input_data/users.json``.

    Args:
        user_id: Value compared against the ``'id'`` field of each record.

    Returns:
        tuple: ``(username, interests)`` read from the ``User`` object built
        from the first matching record.

    Raises:
        IndexError: If no record in the file has the requested id.
    """
    users_path = '../Input_data/users.json'

    # Read as UTF-8 explicitly: without it open() falls back to the platform
    # locale, which can mis-decode non-ASCII text on some systems.
    with open(users_path, encoding='utf-8') as f:
        users = json.load(f)

    # Stop at the first match instead of building a full list and indexing it.
    record = next((entry for entry in users if entry['id'] == user_id), None)
    if record is None:
        # Same exception type as the former `[...][0]`, but with a useful message.
        raise IndexError(f"No user with id {user_id!r} found in {users_path}")

    user = User(record)

    return user.username, user.interests

# Load the selected user's name and interests; `username` is used later to name the output file.
username, interests = initialize_user(user_id)

### Loading the data

In [140]:
# Open the project database (the path is relative to the notebook's working directory).
database = Database('../database.db')

# Fetch the selected user's videos together with their metadata.
# NOTE(review): presumably ordered oldest-first, going by the method name — confirm in Database.
videos = database.get_videos_sorted_by_oldest_with_metadata(user_id=user_id)

# Tabular view of the result; every later cell works on this frame.
classifications = pd.DataFrame(videos)


Making sure the data is loaded correctly

In [141]:
# Show the frame as the cell output (pandas abbreviates long frames in its display).
classifications

Unnamed: 0,id,url,video_id,user_id,parent_classification,classification,reaction,reason,execution_time,timestamp,title,description,tags,playtime,channel_name,youtube_category,youtube_topics
0,240,https://www.youtube.com/shorts/izPP2qTQf-g,izPP2qTQf-g,2,Music,Eurovision,['skip'],The user would skip the video because it is pr...,2.7263605594635,2024-06-08 18:52:47,Netherlands 🇳🇱 Backstage at the Turquoise Carp...,,,10.0,Yura Poghossian,People & Blogs,['Music']
1,241,https://www.youtube.com/shorts/pKZzDo8SzOE,pKZzDo8SzOE,2,Comedy,Shorts,['skip'],The user would skip the video because it is ca...,2.42585802078247,2024-06-08 18:52:50,PEDRO 🤪🦆 #shorts #adanifamily,,,9.0,Adani Family,Comedy,['Lifestyle_(sociology)']
2,242,https://www.youtube.com/shorts/sWLU3trCl6I,sWLU3trCl6I,2,Entertainment,Gaming,['skip'],The video is classified under Gaming and Enter...,2.71519923210144,2024-06-08 18:52:54,EM 10001 Video NA Puzzle 2N OutTreausureMonste...,EM 10001 Video NA Puzzle 2N OutTreausureMonste...,,29.0,Evony,People & Blogs,[]
3,243,https://www.youtube.com/shorts/IErTfXHG5lY,IErTfXHG5lY,2,Howto & Style,Sports,['skip'],The video is about sports but the title and de...,3.16527032852173,2024-06-08 18:52:58,My sister at Netherlands nation team,wwwhandyitemscom.bigcartel.com,,7.0,MR,Howto & Style,['Sport']
4,244,https://www.youtube.com/shorts/3jOJUUVmayc,3jOJUUVmayc,2,Lifestyle,Food,['skip'],The user would skip the video because it is ab...,2.30350661277771,2024-06-08 18:53:02,"Ăn vặt tuổi thơ 2 p2665:, #shorts",,,16.0,Crazy Santu,Entertainment,"['Food', 'Lifestyle_(sociology)']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,1175,https://www.youtube.com/shorts/cGvs03nfdfM,cGvs03nfdfM,2,Sports,Football,['watch'],"The user has an interest in sports, and the vi...",2.70490789413452,2024-06-09 01:18:51,Toni Kroos Evolution 2006-2024,#football #foot #footballedits #footballshorts...,"['toni kroos', 'kroos', 'vinicius', 'vinijr', ...",61.0,champagneyaya,Sports,"['Association_football', 'Sport']"
936,1176,https://www.youtube.com/shorts/0ThGDdsz0c8,0ThGDdsz0c8,2,Hobby,Rubik's Cube,['watch'],The user would watch the video because it is r...,2.64732956886291,2024-06-09 01:19:59,2012 Rubik’s cube solve blindfold world record...,,,39.0,Cubix,Education,"['Hobby', 'Lifestyle_(sociology)']"
937,1177,https://www.youtube.com/shorts/9oLGOz4sQPg,9oLGOz4sQPg,2,Entertainment,Sports,['watch'],The user would watch the video because it is r...,2.75018906593323,2024-06-09 01:20:44,Ronaldo shows some love!❤️,,,33.0,MOVEMANIA,People & Blogs,['Entertainment']
938,1178,https://www.youtube.com/shorts/Dhtwz5VWkOY,Dhtwz5VWkOY,2,Pets & Animals,Dog Training,['watch'],The user would watch the video because it is r...,3.10743451118469,2024-06-09 01:21:24,Sapphie the pomsky tells daddy to make dinner ...,Sapphie the pomsky tells daddy to make dinner ...,"['sapphiethepomsky', 'sapphie', 'pomsky', 'pom...",18.0,sapphie_the_pomsky,Pets & Animals,"['Lifestyle_(sociology)', 'Pet']"


Checking data for missing values

In [142]:
# Count NaN/None entries per column.
# NOTE(review): the preview above shows blank `description`/`tags` cells while every count here is 0,
# so missing text is probably stored as empty strings, which isnull() does not flag — confirm.
classifications.isnull().sum()

id                       0
url                      0
video_id                 0
user_id                  0
parent_classification    0
classification           0
reaction                 0
reason                   0
execution_time           0
timestamp                0
title                    0
description              0
tags                     0
playtime                 0
channel_name             0
youtube_category         0
youtube_topics           0
dtype: int64

### Recoding classifications

Because the classifications were produced by an AI without a fixed set of labels, some of them are near-duplicates of each other. We therefore ask the AI to merge similar classifications into a single label.

In [143]:
# Get all unique classifications and parent classifications
# (`Series.unique()` returns the distinct values as an array, in order of first appearance).
unique_classifications = classifications['classification'].unique()
unique_parent_classifications = classifications['parent_classification'].unique()

# Combine similar classifications using an LLM

# Marker placed directly before and after the user message that carries the labels.
delimiter = '####'

# System prompt shared by both remapping requests (parent and fine-grained labels).
# It contains the word "JSON", and shows the expected {new_label: [old_labels, ...]} shape.
system_message_remap = '''

Here is a list of classifications. Some of the classifications are very similar to each other. Can you group similar classifications together? 

What is meant by "similar" is that the classifications are related to the same topic or category. For example, "Football Highlights" and "Soccer Highlights" are similar because they are both related to Football it is just different ways of saying the same thing. Furthermore some categories encompass other categories for example "Pets and Animals" encompasses "Pets" so they should be combined into one category as well.

Example:

{
    "Advertising": ["Advertising"],
    "Art": ["Art", "Performing Arts"],
    "Automotive": ["Automotive", "Autos & Vehicles", "Vehicle", "Vehicles"]
}

Ensure that the response is in JSON format but do not return code, only the JSON object as a string.

'''

# Template for the user message; `{classifications}` is filled in with the labels to group.
user_message_remap = '''
classifications: {classifications}
'''

def _build_remap_messages(labels):
    """Build the chat messages that ask the model to group similar labels.

    Args:
        labels: Iterable of classification labels (e.g. the array returned by
            ``Series.unique()``).

    Returns:
        list: A system message holding ``system_message_remap`` and a user
        message holding the labels, wrapped in ``delimiter`` on both sides.
    """
    # Serialise the labels as a JSON list. Formatting a numpy array directly
    # uses its str(), which wraps lines and replaces the middle of long arrays
    # with "...", so labels could silently go missing from the prompt.
    label_list = json.dumps([str(label) for label in labels], ensure_ascii=False)
    user_content = user_message_remap.format(classifications=label_list)

    return [
        {'role': 'system',
         'content': system_message_remap},
        {'role': 'user',
         'content': f"{delimiter}{user_content}{delimiter}"},
    ]


# Request for the parent classifications (the one that is actually sent).
messages_remap_parents = _build_remap_messages(unique_parent_classifications)

# Request for the fine-grained classifications.
messages_remap = _build_remap_messages(unique_classifications)

Now we send the parent classifications to the model and parse the JSON remapping it returns, so that we can apply it to the data.

In [144]:
# Ask the model to group the parent classifications; the helper returns the reply text,
# which is parsed here into a dict of {new_label: [old_labels, ...]}.
response_parents = get_completion_from_messages(messages_remap_parents)
parent_classification_remapping = json.loads(response_parents)
# Display the mapping so it can be checked by eye before it is applied.
parent_classification_remapping

{'Music': ['Music'],
 'Comedy': ['Comedy', 'Memes'],
 'Entertainment': ['Entertainment', 'Film & Animation', 'Film'],
 'Howto & Style': ['Howto & Style', 'How-to'],
 'Lifestyle': ['Lifestyle', 'Beauty', 'Fashion'],
 'Gaming': ['Gaming', 'Games', 'Video Games'],
 'Sports': ['Sports', 'Sport'],
 'Performing Arts': ['Performing Arts'],
 'People & Blogs': ['People & Blogs'],
 'Technology': ['Technology', 'Science & Technology'],
 'Business': ['Business', 'Finance', 'Marketing'],
 'Pets & Animals': ['Pets & Animals', 'Pets', 'Nature'],
 'Automotive': ['Automotive', 'Vehicles', 'Vehicle', 'Autos & Vehicles'],
 'Health': ['Health'],
 'Knowledge': ['Knowledge', 'Education'],
 'Relationships': ['Relationships'],
 'Food': ['Food'],
 'Hobby': ['Hobby', 'Hobbies'],
 'Photography': ['Photography'],
 'Society': ['Society'],
 'Travel & Events': ['Travel & Events', 'Travel'],
 'Art': ['Art']}

In [145]:
'''
It is not necessary to run this code and often it does not work because the response is too long 
'''
# Disabled on purpose: the same request, but for the fine-grained classifications.
# With the request commented out, the note above is the only expression in the
# cell, so it is what the cell displays.
# response = get_completion_from_messages(messages_remap)
# classification_remapping = json.loads(response)
# classification_remapping

'\nIt is not necessary to run this code and often it does not work because the response is too long \n'

In [146]:
# Remap parent classifications where the key is the new classification and the value is the list of old classifications that map to it

def classification_change(classification, remapper):
    """Translate an old label into its merged label.

    Args:
        classification: The label to look up.
        remapper: Mapping of new label -> collection of old labels it replaces.

    Returns:
        The first new label whose collection contains ``classification``, or
        ``classification`` itself when no collection contains it.
    """
    matching_targets = (
        new_label
        for new_label, old_labels in remapper.items()
        if classification in old_labels
    )
    # Fall back to the input when the generator yields nothing.
    return next(matching_targets, classification)

'''
It is not necessary to run this code as classifications are supposed to be quite ambiguous and detailed. 
Futhermore, the number of categories is often too long to be processed by the API which is why we are only remapping the parent classifications.
'''
# Disabled on purpose: remapping of the fine-grained classifications (see the note above).
# classifications['remapped_classification'] = classifications['classification'].apply(
#     lambda x: classification_change(x, classification_remapping))

# Add a column with the merged parent label for every row; labels that the
# mapping does not mention are kept unchanged.
classifications['remapped_parent_classification'] = classifications['parent_classification'].apply(
    lambda x: classification_change(x, parent_classification_remapping))

We can also recode the reactions into two binary indicator columns, `positive` and `negative`.

In [147]:
def isPositive(reaction):
    """Return True if any entry of ``reaction`` is a positive reaction label.

    The positive labels are 'like', 'watch', 'share' and 'positive'; the
    comparison is exact and case-sensitive.
    """
    positive_labels = ('like', 'watch', 'share', 'positive')
    return any(entry in positive_labels for entry in reaction)

def isNegative(reaction):
    """Return True if any entry of ``reaction`` is a negative reaction label.

    The negative labels are 'dislike', 'skip' and 'negative'; the comparison
    is exact and case-sensitive.
    """
    negative_labels = ('dislike', 'skip', 'negative')
    return any(entry in negative_labels for entry in reaction)

def _parse_reaction(raw):
    """Return the reaction as a Python object, parsing it only while it is still text."""
    # ast.literal_eval raises ValueError on values that are already lists, which
    # made this cell fail when it was run a second time on the same frame.
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


# Turn the stored text (e.g. "['skip']") into real lists; safe to re-run.
classifications['reaction'] = classifications['reaction'].apply(_parse_reaction)

# 1/0 indicator columns; a row can be flagged in both if it mixes reaction types.
classifications['positive'] = classifications['reaction'].apply(lambda x: 1 if isPositive(x) else 0)
classifications['negative'] = classifications['reaction'].apply(lambda x: 1 if isNegative(x) else 0)

Now we can save the prepared data to a CSV file so that it can be used for the analysis.

In [148]:
# One output file per user, named after the username loaded earlier.
data_path = f'../Output_data/classifications_{username}.csv'
# index=False leaves the DataFrame index out of the file.
# Note: after the literal_eval step `reaction` holds Python lists, so that column
# is written as the lists' text representation.
classifications.to_csv(data_path, index=False)