## Download files from shared links from my drive and import all libraries

In [None]:
# Fetch the two input files by Google Drive file id so that a fresh runtime
# has USvideos.csv and US_category_id.json in the working directory.

!gdown  '1ANWJFt67CB2Kyr57NeeW8Kpc0A0xvc2i' --output USvideos.csv
!gdown  '1gDYYDqbBIv4qLx_YTzM29f6XXp0RekpL' --output US_category_id.json

Downloading...
From: https://drive.google.com/uc?id=1ANWJFt67CB2Kyr57NeeW8Kpc0A0xvc2i
To: /content/USvideos.csv
100% 2.98M/2.98M [00:00<00:00, 25.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1gDYYDqbBIv4qLx_YTzM29f6XXp0RekpL
To: /content/US_category_id.json
100% 8.50k/8.50k [00:00<00:00, 19.6MB/s]


Import all libraries

In [None]:
import itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import json
import seaborn as sns

# Apriori algorithm for categories - Jack

## Load data to pandas dataframe

In [None]:
# Load the videos CSV into df_us_videos and report its (rows, columns) shape.
# (Pass on_bad_lines='skip' to read_csv if malformed rows ever break parsing.)
df_us_videos = pd.read_csv("USvideos.csv")
print("US videos shape: ", df_us_videos.shape)

US videos shape:  (8000, 11)


  and should_run_async(code)


## Add a new column with the actual category name corresponding to the category_id

In [None]:
import json

# Read the category lookup; the context manager closes the file even if
# JSON parsing fails.
with open('US_category_id.json') as f:
    us_category_id = json.load(f)

# Build id -> title once instead of rescanning the JSON items for every row.
# setdefault keeps the first title if an id is listed more than once.
category_titles = {}
for item in us_category_id["items"]:
    category_titles.setdefault(item["id"], item["snippet"]["title"])

# The JSON ids are compared as strings, so the category_id column is converted
# to str before the lookup. Ids that are missing from the JSON map to NaN,
# which keeps the result aligned with the frame row by row.
list_cat_name = df_us_videos["category_id"].astype(str).map(category_titles).tolist()

# Plain column assignment (rather than DataFrame.insert) appends the column
# on the first run and simply overwrites it when the cell is re-run.
df_us_videos["category_name"] = list_cat_name
print(df_us_videos.head(20)[["category_id", "category_name"]])

  and should_run_async(code)


    category_id         category_name
0            24         Entertainment
1            28  Science & Technology
2            22        People & Blogs
3            28  Science & Technology
4            23                Comedy
5             1      Film & Animation
6            23                Comedy
7            28  Science & Technology
8            22        People & Blogs
9             1      Film & Animation
10           23                Comedy
11           23                Comedy
12           23                Comedy
13           24         Entertainment
14           24         Entertainment
15           28  Science & Technology
16           24         Entertainment
17            1      Film & Animation
18           26         Howto & Style
19            1      Film & Animation


## Find bins of three tier views; can use different methods.

In [None]:
# Largest view count in the data (41,500,672 on the recorded run)
maxView = df_us_videos['views'].max()

# Smallest view count in the data (0 on the recorded run)
minView = df_us_videos['views'].min()

def equal_depth_partitioning(data, num_bins):
    """Return ``num_bins`` edge values splitting ``data`` into equal-count bins.

    The values are sorted, the bin depth is ``len(data) // num_bins``, and the
    i-th edge is the sorted value found at index ``i * depth``. The final edge
    is always the largest value, so the last bin absorbs any remainder.

    Returns a plain list of ``num_bins`` edges in ascending order.
    """
    ordered = np.sort(data)
    depth = len(data) // num_bins

    # Interior edges sit at every multiple of the bin depth.
    edges = [ordered[step * depth] for step in range(1, num_bins)]

    # Close the top bin with the maximum.
    edges.append(ordered[-1])
    return edges

# Split the view counts into three equal-depth bins; the result holds one
# edge value per bin, in ascending order.
view_edges = equal_depth_partitioning(df_us_videos['views'].tolist(), 3)

# The first two edges are the thresholds used for tiering; the third is the
# maximum view count.
lowView = view_edges[0]
mediumView = view_edges[1]
HighView = view_edges[2]
print(f"Low view is less or euqal {lowView}\nMedium view is less or euqal {mediumView}\nHigh view is anything above Medium view")

Low view is less or euqal 150965
Medium view is less or euqal 657842
High view is anything above Medium view


In [None]:
# NOTE: df_copy is deliberately an alias of df_us_videos, not a copy. The
# transaction-building cell further down reads df_us_videos["view_tier"], so
# the new column has to land on the original frame.
df_copy = df_us_videos

# Tier every row in one vectorised pass. Conditions are checked in order, so
# a value at or below lowView is "Low View" even though it also satisfies the
# second test; anything matching neither falls through to "High View".
views = df_copy["views"]
list_view_tier = np.select(
    [views <= lowView, views <= mediumView],
    ["Low View", "Medium View"],
    default="High View",
).tolist()

# Plain assignment appends the column on the first run and overwrites it on
# a re-run, whereas DataFrame.insert raises once the column already exists.
df_copy["view_tier"] = list_view_tier
print(df_copy.head(20)[["category_id", "category_name", "view_tier"]])

    category_id         category_name    view_tier
0            24         Entertainment    High View
1            28  Science & Technology    High View
2            22        People & Blogs    High View
3            28  Science & Technology    High View
4            23                Comedy    High View
5             1      Film & Animation    High View
6            23                Comedy    High View
7            28  Science & Technology    High View
8            22        People & Blogs    High View
9             1      Film & Animation    High View
10           23                Comedy    High View
11           23                Comedy  Medium View
12           23                Comedy  Medium View
13           24         Entertainment  Medium View
14           24         Entertainment  Medium View
15           28  Science & Technology  Medium View
16           24         Entertainment  Medium View
17            1      Film & Animation    High View
18           26         Howto &

## The Apriori algorithm

In [None]:
"""
Apriori algorithm helps find frequent itemsets in a database based on associations between the presence or absence of items.

For e.g. the algorithm would discover that when a customer buys bread, they often end up buying butter & eggs as well.
This indicates a strong association between these items. These associations help businesses to make decisions to improve sales, customer satisfaction, etc.

To improve the efficiency of level-wise generation of frequent itemsets, an important property is used called Apriori property which helps by reducing the search space.
Apriori assumes that,
    All subsets of a frequent itemset must be frequent(Apriori property).
    If an itemset is infrequent, all its supersets will be infrequent.

https://www.engati.com/glossary/apriori-algorithm
"""

def myApriori(trans_data : list, min_sup: int, K_max: int):
    """
    Find the frequent itemsets of size 1..K_max, one level at a time.

    trans_data: list of transactions, each a list of hashable, mutually
                orderable items (e.g. strings)
    min_sup:    all frequent itemsets with support count ≥ min_sup
    K_max:      max size of a basket

    Returns a list with one entry per level: entry 0 is the flat list of
    frequent single items in first-seen order; entry k-1 is the list of
    frequent k-itemsets, each given as a sorted list. The search stops at the
    first size (≥ 2) that yields no frequent itemset.

    An item repeated inside one transaction counts once toward support, at
    every level.

    example input with min_sup = 2, K_max = 3:
      [['I1', 'I2', 'I5'], ['I2', 'I4'], ['I2', 'I4'], ['I1', 'I2', 'I4'], ['I1', 'I3'], ['I2', 'I3'], ['I1', 'I3'], ['I1', 'I2', 'I3', 'I5'], ['I1', 'I2', 'I3']]

    then, example output:
      [['I1', 'I2', 'I5', 'I4', 'I3'], [['I1', 'I2'],['I1', 'I5'],['I2', 'I5'],['I2', 'I4'],['I1', 'I3'],['I2', 'I3']], [['I1', 'I2', 'I5'], ['I1', 'I2', 'I3']]]
    """
    # Drop repeated items inside a transaction while keeping first-seen order,
    # so that level 1 counts support the same way the larger levels do.
    transactions = [list(dict.fromkeys(transaction)) for transaction in trans_data]

    item_support = {}
    for transaction in transactions:
        for item in transaction:
            item_support[item] = item_support.get(item, 0) + 1

    frequent_items = [[item for item, count in item_support.items() if count >= min_sup]]

    # Frequent itemsets of the previous level, stored as sorted tuples so that
    # subset lookups are order-independent.
    previous_level = {(item,) for item in frequent_items[0]}

    for k in range(2, K_max + 1):
        # Apriori property: a frequent k-itemset can only contain items that
        # occur in some frequent (k-1)-itemset.
        surviving_items = {item for itemset in previous_level for item in itemset}

        candidate_support = {}
        for transaction in transactions:
            kept = [item for item in transaction if item in surviving_items]
            if len(kept) < k:
                continue
            # Sorting first makes every generated combination a sorted tuple.
            for candidate in itertools.combinations(sorted(kept), k):
                # Skip candidates with an infrequent (k-1)-subset; they cannot
                # reach min_sup, so the result is unchanged.
                if all(subset in previous_level
                       for subset in itertools.combinations(candidate, k - 1)):
                    candidate_support[candidate] = candidate_support.get(candidate, 0) + 1

        current_level = [candidate for candidate, count in candidate_support.items()
                         if count >= min_sup]
        if not current_level:
            break

        # list(candidate) keeps the sorted order; going through set() would
        # make the item order inside each itemset arbitrary.
        frequent_items.append([list(candidate) for candidate in current_level])
        previous_level = set(current_level)

    return frequent_items

In [None]:
# Smoke test: the example transactions from the myApriori docstring,
# with min_sup=2 and K_max=3.

myApriori(
    [['I1', 'I2', 'I5'], ['I2', 'I4'], ['I2', 'I4'], ['I1', 'I2', 'I4'], ['I1', 'I3'], ['I2', 'I3'], ['I1', 'I3'], ['I1', 'I2', 'I3', 'I5'], ['I1', 'I2', 'I3']],
    2,
    3
)

[['I1', 'I2', 'I5', 'I4', 'I3'],
 [['I2', 'I1'],
  ['I2', 'I5'],
  ['I5', 'I1'],
  ['I2', 'I4'],
  ['I1', 'I3'],
  ['I2', 'I3']],
 [['I2', 'I1', 'I5'], ['I2', 'I1', 'I3']]]

In [None]:
def gen_confidenceItems_TOP(trans_data : list, min_sup: int, top_n: int = 5):
    """
    Rank the rules X -> Y derived from the frequent pairs by confidence.

    trans_data: list of transactions (each a list of items)
    min_sup:    support-count threshold passed to myApriori
    top_n:      number of highest-confidence rules to return (default 5)

    Frequent itemsets are mined with myApriori up to size 2. Every itemset
    yields one rule per member, with that member as the consequent Y and the
    remaining items as the antecedent X.

    Returns a list of at most top_n tuples, highest confidence first:
      (rule, confidence, X_Y_support, X_support)
    where rule is the antecedent items followed by the consequent as its last
    element, and confidence = X_Y_support / X_support (0 when X_support is 0).
    """
    frequent_items = myApriori(trans_data, min_sup, 2)

    # Entry 0 holds single items, which cannot form a rule; only the itemset
    # levels after it are used.
    frequent_itemsets = [itemset for level in frequent_items[1:] for itemset in level]

    rules = []
    seen_rules = set()
    for itemset in frequent_itemsets:
        for consequent in itemset:
            antecedent = sorted(item for item in itemset if item != consequent)
            rule = antecedent + [consequent]
            # A set of tuples gives constant-time duplicate detection.
            rule_key = tuple(rule)
            if rule_key not in seen_rules:
                seen_rules.add(rule_key)
                rules.append(rule)

    # Convert each transaction to a set once instead of once per rule.
    transaction_sets = [set(transaction) for transaction in trans_data]

    confidencePairs = []
    for rule in rules:
        rule_items = set(rule)
        antecedent_items = set(rule[:-1])

        X_Y_support = sum(1 for items in transaction_sets if rule_items.issubset(items))
        X_support = sum(1 for items in transaction_sets if antecedent_items.issubset(items))
        confidence = X_Y_support / X_support if X_support != 0 else 0
        confidencePairs.append((rule, confidence, X_Y_support, X_support))

    # list.sort is stable, so rules with equal confidence keep generation order.
    confidencePairs.sort(key=lambda entry: entry[1], reverse=True)

    return confidencePairs[:top_n]



## Create Transaction, Run the Apriori algorithm

In [None]:
# One two-item transaction per video: its view tier and its category name.
list_trans = [
    [tier, category]
    for tier, category in zip(df_us_videos["view_tier"], df_us_videos["category_name"])
]

print(len(list_trans))

# Mining parameters: largest itemset size and support-count threshold.
K_max = 2
min_sup = 100

8000


In [None]:
# Use the min_sup / K_max parameters defined with the transactions rather
# than repeating their values as literals.
myApriori(list_trans, min_sup, K_max)

[['High View',
  'Entertainment',
  'Science & Technology',
  'People & Blogs',
  'Comedy',
  'Film & Animation',
  'Medium View',
  'Howto & Style',
  'Music',
  'News & Politics',
  'Low View',
  'Sports',
  'Education',
  'Autos & Vehicles',
  'Pets & Animals'],
 [['High View', 'Entertainment'],
  ['High View', 'Science & Technology'],
  ['People & Blogs', 'High View'],
  ['Comedy', 'High View'],
  ['High View', 'Film & Animation'],
  ['Comedy', 'Medium View'],
  ['Medium View', 'Entertainment'],
  ['Medium View', 'Science & Technology'],
  ['Medium View', 'Howto & Style'],
  ['High View', 'Music'],
  ['News & Politics', 'Medium View'],
  ['Low View', 'Music'],
  ['Medium View', 'Education'],
  ['Entertainment', 'Low View'],
  ['Medium View', 'Music'],
  ['Low View', 'Howto & Style'],
  ['High View', 'Howto & Style'],
  ['Sports', 'Medium View'],
  ['Science & Technology', 'Low View'],
  ['People & Blogs', 'Low View'],
  ['News & Politics', 'Low View'],
  ['Comedy', 'Low View'],
  [

In [None]:
# Highest-confidence rules; each tuple is (rule items with the consequent
# last, confidence, support count of the rule, support count of the antecedent).
gen_confidenceItems_TOP(list_trans, min_sup)

[(['Comedy', 'High View'], 0.582010582010582, 440, 756),
 (['Sports', 'Low View'], 0.4951219512195122, 203, 410),
 (['Education', 'Medium View'], 0.46407185628742514, 155, 334),
 (['News & Politics', 'Low View'], 0.4504792332268371, 282, 626),
 (['Film & Animation', 'High View'], 0.42063492063492064, 159, 378)]

# Apriori algorithm on all video tags - Silvestre

## Adding a season attribute

In [None]:
#df = pd.read_csv("USvideos.csv")
#df_us_videos
# Function to determine the season from month and day
def get_season(date):
    """Return the season ('Spring', 'Summer', 'Fall' or 'Winter') for a date.

    The month is read from the second dot-separated field of the value:
    3-5 -> Spring, 6-8 -> Summer, 9-11 -> Fall, anything else -> Winter.

    NOTE(review): assumes the 'date' values look like 'DD.MM' — confirm
    against the CSV.

    Raises IndexError if the value has no '.' and ValueError if the month
    field is not an integer.
    """
    if isinstance(date, float):
        # A value such as "13.10" that was parsed as a float becomes 13.1, and
        # str(13.1) == "13.1" would read October as month 1. Formatting with
        # two decimals restores the dropped trailing zero.
        date_str = f"{date:.2f}"
    else:
        date_str = str(date)

    month = int(date_str.split('.')[1])
    if 3 <= month <= 5:
        return 'Spring'
    if 6 <= month <= 8:
        return 'Summer'
    if 9 <= month <= 11:
        return 'Fall'
    return 'Winter'

# Derive a 'season' column by applying get_season to every 'date' value
df_us_videos['season'] = df_us_videos['date'].apply(get_season)

# Optional export of the augmented frame (disabled):
#df.to_csv('/mnt/data/USvideos_modified.csv', index=False)

# Show the first rows to check the new column
print(df_us_videos.head())

## Using the Apriori algorithm to identify combinations of tags that frequently occur together in trending videos
Each video is treated as a "transaction" and its tags as the "items".

### **Step 1:** Creating a list of tags for each video

In [None]:

if 'df_us_videos' in locals():
    # video_id -> list of tags, obtained by splitting the '|'-separated tags
    # string; rows whose tags value is not a string are left out.
    video_tags_dict = {
        video_id: tags.split('|')
        for video_id, tags in zip(df_us_videos['video_id'], df_us_videos['tags'])
        if isinstance(tags, str)
    }

    # Print the first five entries as a sanity check
    sample_items = list(video_tags_dict.items())[:5]
    for video_id, tags in sample_items:
        print(f"Video ID: {video_id}\nTags: {tags}\n")
else:
    print("Dataframe is not defined due to an error in reading the file.")

### **Step 2:** Applying alg to get patterns

In [None]:
from IPython.display import display, HTML

# One transaction per video: the tag lists built in the previous step
# (video_tags_dict must already exist).
video_tags = list(video_tags_dict.values())
encoder = TransactionEncoder()
transformed_data = encoder.fit_transform(video_tags)
# One column per distinct tag, one row per video
df2 = pd.DataFrame(transformed_data, columns=encoder.columns_)

# Mine frequent tag itemsets with mlxtend's apriori; min_support here is a
# fraction of the videos, and use_colnames reports tags by name.
frequent_itemsets = apriori(df2, min_support=0.019, use_colnames=True)
# Order the itemsets from most to least frequent
frequent_itemsets_sorted = frequent_itemsets.sort_values(by='support', ascending=False)

# Render the sorted itemsets as an HTML table
display(HTML(frequent_itemsets_sorted.to_html(index=False)))  # index=False hides the row index


### **Step 3:** Analysis of the rules based on their support, confidence, and lift

In [None]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent tag itemsets, keeping the rules
# whose confidence is at least 0.5.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=.5)

# lift(A -> C) = confidence(A -> C) / support(C).
# Do not divide by (consequent support / antecedent support): that ratio
# reduces to support / consequent support, which is not lift and equals 1.0
# whenever the rule's support matches the consequent's support.
rules["lift"] = rules["confidence"] / rules["consequent support"]

# Strongest associations first: by lift, ties broken by confidence.
rules = rules.sort_values(['lift', 'confidence'], ascending=[False, False])

# Display the sorted rules as an HTML table without the row index
rules_sorted = rules
display(HTML(rules_sorted.to_html(index=False)))

  and should_run_async(code)


antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
"(celebrities, funny video)","(humor, jokes)",0.019882,0.019459,0.019459,0.978723,1.0,0.019072,46.085448,1.0
"(celebrities, humor)","(jokes, funny video)",0.020305,0.019459,0.019459,0.958333,1.0,0.019063,23.532995,1.0
"(humor, funny video)","(celebrities, jokes)",0.020305,0.019459,0.019459,0.958333,1.0,0.019063,23.532995,1.0
(funny video),"(celebrities, jokes)",0.022843,0.019459,0.019459,0.851852,1.0,0.019014,6.618655,1.0
(funny video),"(humor, jokes)",0.022843,0.019459,0.019459,0.851852,1.0,0.019014,6.618655,1.0
(funny video),"(celebrities, humor, jokes)",0.022843,0.019459,0.019459,0.851852,1.0,0.019014,6.618655,1.0
(celebrities),"(jokes, funny video)",0.024958,0.019459,0.019459,0.779661,1.0,0.018973,4.45015,1.0
(celebrities),"(humor, jokes)",0.024958,0.019459,0.019459,0.779661,1.0,0.018973,4.45015,1.0
(celebrities),"(humor, jokes, funny video)",0.024958,0.019459,0.019459,0.779661,1.0,0.018973,4.45015,1.0
(humor),"(celebrities, funny video)",0.038917,0.019882,0.019882,0.51087,1.0,0.019108,2.003798,1.0


# Apriori alg. on video tags based on season - Silvestre

## **Step 1:** Create dfs for fall and winter

In [None]:
if 'df_us_videos' in locals():
    # video_id -> tag list, kept separately for the 'Fall' and 'Winter' rows;
    # rows whose tags value is not a string are left out.
    fall_video_tags_dict = {}
    winter_video_tags_dict = {}
    seasonal_rows = zip(df_us_videos['video_id'], df_us_videos['tags'], df_us_videos['season'])
    for video_id, tags, season in seasonal_rows:
        if not isinstance(tags, str):
            continue
        if season == 'Fall':
            fall_video_tags_dict[video_id] = tags.split('|')
        elif season == 'Winter':
            winter_video_tags_dict[video_id] = tags.split('|')

    # First five entries of each dictionary, for a quick visual check
    fall_sample_items = list(fall_video_tags_dict.items())[:5]
    winter_sample_items = list(winter_video_tags_dict.items())[:5]

    print("Fall Season Samples:")
    for video_id, tags in fall_sample_items:
        print(f"Video ID: {video_id}\nTags: {tags}\n")

    print("Winter Season Samples:")
    for video_id, tags in winter_sample_items:
        print(f"Video ID: {video_id}\nTags: {tags}\n")
else:
    print("Dataframe is not defined due to an error in reading the file.")

## **Step 2:** Apply Apriori to the Winter and Fall tags

In [None]:
encoder = TransactionEncoder()
# One transaction (tag list) per video, per season
fall_tags = list(fall_video_tags_dict.values())
winter_tags = list(winter_video_tags_dict.values())

# Fit once on both seasons so the two encoded frames share the same columns
all_tags = fall_tags + winter_tags
encoder.fit(all_tags)

# Encode each season with the shared encoder (transform only, no refit)
fall_transformed_data = encoder.transform(fall_tags)
winter_transformed_data = encoder.transform(winter_tags)

fall_df = pd.DataFrame(fall_transformed_data, columns=encoder.columns_)
winter_df = pd.DataFrame(winter_transformed_data, columns=encoder.columns_)

# Mine each season separately with the same support threshold
fall_frequent_itemsets = apriori(fall_df, min_support=0.019, use_colnames=True)
winter_frequent_itemsets = apriori(winter_df, min_support=0.019, use_colnames=True)

# Most frequent itemsets first
fall_frequent_itemsets_sorted = fall_frequent_itemsets.sort_values(by='support', ascending=False)
winter_frequent_itemsets_sorted = winter_frequent_itemsets.sort_values(by='support', ascending=False)

  and should_run_async(code)


## **Step 3:** Display patterns

In [None]:
# Fall itemsets as an HTML table, highest support first
print("Fall Frequent Itemsets:")
display(HTML(fall_frequent_itemsets_sorted.to_html(index=False)))

In [None]:
# Winter itemsets as an HTML table, highest support first
print("Winter Frequent Itemsets:")
display(HTML(winter_frequent_itemsets_sorted.to_html(index=False)))

# Apriori algorithm on all video titles - Silvestre

## Preprocessing video titles (removing stop words)

In [None]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus
nltk.download('stopwords')

# English stop words, held in a set for membership tests
stop_words = set(stopwords.words('english'))
def clean_title(title):
    """Return the lower-cased words of ``title`` that are not stop words.

    Punctuation (``string.punctuation`` plus the en and em dashes) is deleted
    before the title is lower-cased and split on whitespace.
    """
    strip_table = str.maketrans('', '', string.punctuation + '–—')
    lowered_tokens = title.translate(strip_table).lower().split()
    return [token for token in lowered_tokens if token not in stop_words]

if 'df_us_videos' in locals():
    # video_id -> cleaned title words; rows whose title is not a string are
    # left out.
    video_titles_dict = {
        video_id: clean_title(title)
        for video_id, title in zip(df_us_videos['video_id'], df_us_videos['title'])
        if isinstance(title, str)
    }

    # Print the first five entries as a sanity check
    sample_items = list(video_titles_dict.items())[:5]
    for video_id, title in sample_items:
        print(f"Video ID: {video_id}\nTitle: {title}\n")
else:
    print("Dataframe is not defined due to an error in reading the file.")

## applying Apriori alg.

In [None]:
# One transaction per video: its list of cleaned title words
video_titles = list(video_titles_dict.values())

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from IPython.display import display

# Encode the word lists with one column per distinct word
encoder = TransactionEncoder()
transformed_data = encoder.fit_transform(video_titles)
df_titles = pd.DataFrame(transformed_data, columns=encoder.columns_)

# Mine frequent word itemsets; min_support here is a fraction of the videos
frequent_itemsets = apriori(df_titles, min_support=0.0125, use_colnames=True)

# Most frequent itemsets first
sorted_frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

# Lift pandas' display limits so the table is shown untruncated.
# These options are set globally and stay in effect for later cells.
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Use maximum window width
pd.set_option('display.max_colwidth', None)  # Display the full width of each column

# Display the sorted frequent itemsets
display(sorted_frequent_itemsets)

## Generate a word cloud based on frequent words

In [None]:
def wordcld(a):
    """Draw a word cloud built from the words in ``a["title"]``.

    Every title is split on whitespace and all the words are joined into one
    space-separated text, from which the cloud is generated and shown on a new
    matplotlib figure. Returns nothing.
    """
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    # Flatten every title into a single stream of words
    text = " ".join(word for title in a["title"] for word in title.split())

    cloud = WordCloud(
        width=1200,
        height=500,
        collocations=False,
        background_color="white",
        colormap="tab20b",
    )
    wc = cloud.generate(text)

    plt.figure(figsize=(15,10))
    plt.imshow(wc, interpolation='bilinear')
    plt.ylabel("US videos")



# The notebook's frame is df_us_videos; `df` is never assigned in this
# notebook (its read_csv line is commented out), so wordcld(df) would raise
# NameError on a fresh run.
wordcld(df_us_videos)