In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import ast

# Load the CSV file
df = pd.read_csv('cleaned_comments.csv')

# Get the first 10 unique video IDs
unique_video_ids = df['Video ID'].unique()[:10]

# Loop through each video ID and generate a word cloud
for video_id in unique_video_ids:
    # Filter rows by video ID
    filtered_df = df[df['Video ID'] == video_id]

    # Extract tokens and combine into a single list
    all_tokens = []
    for tokens_str in filtered_df['Tokens']:
        tokens_list = ast.literal_eval(tokens_str)
        all_tokens.extend(tokens_list)

    # Join all tokens into a single string
    text = ' '.join(all_tokens)

    # Generate the word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # Display the word cloud
    plt.figure(figsize=(10, 5))
    plt.title(f'Word Cloud for Video ID: {video_id}')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


In [88]:
import pandas as pd
from gensim import corpora, models

print(pd.__version__)

# Load the data from the CSV file into a DataFrame
comments_df = pd.read_csv('cleaned_comments.csv')
descriptions_df = pd.read_csv('descriptions.csv')

merged_df = pd.merge(comments_df, descriptions_df, on='Video ID', suffixes=('_comments', '_descriptions'))

personality_keywords = ['personality', 'friend', 'friendly', 'funny', 'house', 'home', 'love', 'beautiful']

video_ids = merged_df['Video ID'].unique()

topics_data = []

for video_id in video_ids:
    filtered_comments = merged_df[(merged_df['Video ID'] == video_id) & ~merged_df['Tokens'].str.contains('|'.join(personality_keywords))]['Tokens'].apply(eval).tolist()
    filtered_descriptions = merged_df[merged_df['Video ID'] == video_id]['Main Description'].tolist()
    
    # Convert descriptions to list of lists
    filtered_descriptions = [[description] for description in filtered_descriptions]

    # Combine comments and descriptions
    combined_text = [comment + description for comment, description in zip(filtered_comments, filtered_descriptions)]

    # Create Dictionary
    dictionary = corpora.Dictionary(combined_text)

    # Create Corpus: Term Document Frequency
    corpus = [dictionary.doc2bow(text) for text in combined_text]

    # Train LDA Model
    num_topics = 10  # Number of topics you want to discover
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    # Save topics to the DataFrame
    for topic in lda_model.print_topics():
        topics_data.append({'Video ID': video_id, 'Topic': topic})
        
    # Print topics for the first video
    print(f"Topics for Video ID: {video_id}")
    for topic in lda_model.print_topics():
        print(topic)
        
# Convert the list of topics to a DataFrame
topics_df = pd.DataFrame(topics_data)

# Save the DataFrame to a CSV file
topics_df.to_csv('topics.csv', index=False)


2.2.2
Topics for Video ID: kavjzsRtuuA
(0, '0.141*"[\'ad\', \'welcomed\', \'hit\', \'music\', \'producer\', \'tour\', \'home\', \'los\', \'built\', \'spacious\', \'home\', \'throw\', \'away\', \'recording\', \'despite\', \'proximity\', \'design\', \'brief\', \'help\', \'rachel\', \'leigh\', \'ward\', \'dana\', \'vitrano\', \'bonvivant\', \'interiors\', \'designer\', \'keefe\', \'butler\', \'studio\', \'resulting\', \'space\', \'eclectic\', \'playful\', \'plenty\', \'kick\', \'back\', \'relax\', \'family\', \'including\', \'superstar\', \'girlfriend\', \'selena\', \'red\', \'velvet\', \'movie\', \'theatre\', \'candy\', \'room\', \'dressing\', \'home\', \'full\', \'personality\', \'fun\', \'know\', \'house\', \'friend\', \'told\', \'love\', \'coming\', \'house\', \'feel\', \'like\', \'let\', \'go\', \'true\', \'exactly\', \'going\', \'shop\', \'pieces\', \'inspired\', \'la\', \'buy\', \'something\', \'retail\', \'earn\', \'affiliate\', \'see\', \'la\', \'home\', \'artwork\', \'courtesy\'

KeyboardInterrupt: 

In [125]:
import pandas as pd
import ast

# Read the topics.csv file into a DataFrame
topics_df = pd.read_csv('topics.csv')

# Convert the 'Topic' column to string type
topics_df['Topic'] = topics_df['Topic'].astype(str)

# Function to clean and parse the topic strings
def clean_and_parse_topic(topic_str):
    try:
        # Remove leading and trailing parentheses and any tuple formatting
        topic_str = topic_str.strip("()")
        parts = topic_str.split(", ", 1)
        if len(parts) > 1:
            topic_str = parts[1]
        
        # Replace specific problematic characters
        cleaned_str = topic_str.replace('""', '').replace("'", "").replace('\\', '')
        
        # Extract the description weight
        desc_weight = float(cleaned_str.split("*")[0])
        
        # Extract the topics
        topics = []
        parts = cleaned_str.split(" + ")
        for part in parts:
            if '*' in part:
                _, topic = part.split('*', 1)
                topic = topic.strip()
                topics.append(topic)
        return desc_weight, topics
    except Exception as e:
        print(f"Error processing '{topic_str}': {e}")
        return None, None


# Group by 'Video ID'
grouped = topics_df.groupby('Video ID')

# Function to extract top 5 topics based on weights
def get_top_5_topics(group):
    topics = []
    for _, row in group.iterrows():
        desc_weight, topic_info = clean_and_parse_topic(row['Topic'])
        if topic_info:
            topics.append((desc_weight, topic_info))
    # Sort topics based on description weights and select the top 5
    top_5_topics = sorted(topics, key=lambda x: x[0], reverse=True)[:5]
    return [(weight, topic) for weight, topic in top_5_topics]


# Apply the function to each group and concatenate the results
result = grouped.apply(get_top_5_topics).reset_index()
result.columns = ['Video ID', 'Top 5 Topics']

# Write the result to a new CSV file
result.to_csv('top_5_topics.csv', index=False)


  result = grouped.apply(get_top_5_topics).reset_index()


In [191]:
import csv
import ast  # For safe evaluation of string representations of lists

# Open the input and output files
with open('top_5_topics.csv', 'r', newline='', encoding='utf-8') as input_file, \
     open('cleaned_top_5_topics.csv', 'w', newline='', encoding='utf-8') as output_file:
    reader = csv.reader(input_file)
    writer = csv.writer(output_file)

    writer.writerow(['Video ID', 'Weight', 'Topics'])

    next(reader)

    # Iterate over each row in the input file
    for row in reader:
        video_id = row[0]  # Extract video ID
        topics = eval(row[1])  # Convert the string representation of topics to a list

        # Iterate over each topic in the list
        for weight, topic_list in topics:
            topic_list = [topic for topic in topic_list if not topic.startswith('[') and not topic.endswith(']')]
            topic_string = ', '.join(topic_list)  # Convert the topic list to a string
            writer.writerow([video_id, weight, topic_string])

print("Cleaning complete. Output saved to cleaned_top_5_topics.csv")

Cleaning complete. Output saved to cleaned_top_5_topics.csv


In [194]:
import pandas as pd
import ast

# Read the CSV file
df = pd.read_csv("cleaned_top_5_topics.csv")

# Function to process the Topics column
def process_topics(topics): 
    # Remove the triple quotes and split the string into two parts
    parts = topics.replace('"""', '').split('", "', 1)
    
    # Check if there are two parts
    if len(parts) == 2:
        # Add a closing quote to the second part if it's missing
        if not parts[1].endswith('"'):
            parts[1] += '"'
        # Parse the second part as a list and return it as a string
        try:
            return str(ast.literal_eval(parts[1]))
        except SyntaxError:
            return parts[1]
    else:
        # If there's only one part, return it as is
        return parts[0]


# Apply the function to the Topics column
df['Topics'] = df['Topics'].apply(process_topics)


df.to_csv("cleaned_top_5_topics.csv", index=False)


In [215]:

from collections import defaultdict, Counter

df = pd.read_csv("cleaned_top_5_topics.csv")

topics_dict = defaultdict(set)
word_counter = Counter()

remove_words = ["love", "loved", "lovely", "house", "home", "like", "gorgeous", "amazing", "beautiful", "ever", 
                "lol", "omg", "nice", "like", "cool", "best", "looks", "would", "one", "much", "room",
                "look", "wow", "see", "good", "know", "stunning", "people", "want", "style", "seen", "place", 
                "video", "taste", "cute", "everything", "well", "absolutely", "get", "space", "anyone", "tour",
                "apartment", "money", "design", "vibe", "could", "seems", "way", "favorite", "happy", "rich", "thank",
                "two", "still", "need", "every", "feel", "show", "many", "got", "damn", "time", "please", "thought", "blurred",
                "homes", "voice", "someone", "yes", "actually", "done", "god", "life", "going", "ariel", "pretty", "laugh", "person",
                "together", "though", "always", "thing", "vibes", "also", "day", "else", "episode", "say", "que", "think", "melo", "little",
                "loves", "awesome", "never", "rooms", "proud", "riverdale", "job", "queen", "watch", "sharing", "thanks", "lady", "married", 
                "right", "baby", "houses"]

# Process the data
for index, row in df.iterrows():
    video_id, weight, topics = row['Video ID'], row['Weight'], row['Topics']
    # Remove extra quotation marks
    topics = topics.replace('"', '')
    topics = topics.split(", ")
    word_counter.update(topics)
    topics = [topic for topic in topics if topic not in remove_words]
    topics_dict[video_id].update(topics)
    
sd_topics = pd.DataFrame(columns=['Video ID', 'Topics'])

# Print the results
for video_id, topics in topics_dict.items():
    print(f"{video_id}: {', '.join(topics)}")
    new_row = pd.DataFrame({'Video ID': [video_id], 'Topics': [', '.join(topics)]})
    sd_topics = pd.concat([sd_topics, new_row], ignore_index=True)
        
sd_topics.to_csv('sd_topics.csv', index=False)

print("\nMost common words:")
for word, count in word_counter.most_common(10):  
    print(f"{word}: {count}")


-TeeIEh2IE8: gallery, second, elmo, remember, trophies, keep, art, chair, nepal, cold, body, museum, magazine, times, trash, trophy, door, unique
-zkOLKMiX9c: woman, art, pepper, old, body, white
0K9eZCW-lvc: tate, kitchen, body, sweet, something, dress, cozy
0_BEwbnmRZ4: sounds, leave, panic, humor, jazzy, guy, hilarious, man, monotone, spanish
5WH-PSs9hI8: mama, fracking, drag, black, disco, color, closet, joy, white
5vNoHwgoIjw: coop, chickens, fish, mcguire, samsung, chicken, girl, sons, lizzie, kids, dress
5zvsAwsEwPg: couple, story, sweet, joy, wife
7R2RyqU4hjI: title, accent, garden, couple, catherine, pot, mcnulty, haunted, castle, wonderful, ancestors, family, needs, irish
7Z76MiQv0zg: sword, woman, lotr, soul, soothing, magnolia, attic, display, arwen, sweet, tree, asmr
7cGmj0AR3z8: creepy, statue, colombian, jersey, guy, bedroom, coffee, quality, casa, colombia, handsome, neymar, hot
80z6drP0NY8: player, miss, mind, hard, furniture, guy, future, come, man, decor, vacation, n

In [79]:
from diffusers import StableDiffusionPipeline
import torch
print("CUDA available:", torch.cuda.is_available())

# Load the model
model_id = "CompVis/stable-diffusion-v1-4"
device = "cuda" if torch.cuda.is_available() else "cpu"

pipe = StableDiffusionPipeline.from_pretrained(model_id)
pipe = pipe.to(device)

prompt = "A futuristic cityscape at sunset"
image = pipe(prompt)["sample"][0]

# Display the generated image
image.show()



CUDA available: False




Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

KeyboardInterrupt: 