<a href="https://colab.research.google.com/github/claudiaxpreda/YouPropose/blob/main/data_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [158]:
import os
import pandas as pd
import json
import nltk
import numpy as np


import re
import string                             

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer 



nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords

from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer



# Folder (below the current working directory) that holds every input file.
WORKDIR = f"{os.getcwd()}/drive/MyDrive/SAC"

# Per-country CSV files with the video records.
US_PATH = f'{WORKDIR}/USvideos.csv'
GB_PATH = f'{WORKDIR}/GBvideos.csv'
CA_PATH = f'{WORKDIR}/CAvideos.csv'
DE_PATH = f'{WORKDIR}/DEvideos.csv'
FR_PATH = f'{WORKDIR}/FRvideos.csv'
IN_PATH = f'{WORKDIR}/INvideos.csv'
JP_PATH = f'{WORKDIR}/JPvideos.csv'
KR_PATH = f'{WORKDIR}/KRvideos.csv'
MX_PATH = f'{WORKDIR}/MXvideos.csv'
RU_PATH = f'{WORKDIR}/RUvideos.csv'

# Per-country JSON files mapping category ids to category titles.
US_PATH_CAT = f'{WORKDIR}/US_category_id.json'
GB_PATH_CAT = f'{WORKDIR}/GB_category_id.json'
CA_PATH_CAT = f'{WORKDIR}/CA_category_id.json'
DE_PATH_CAT = f'{WORKDIR}/DE_category_id.json'
FR_PATH_CAT = f'{WORKDIR}/FR_category_id.json'
IN_PATH_CAT = f'{WORKDIR}/IN_category_id.json'
JP_PATH_CAT = f'{WORKDIR}/JP_category_id.json'
KR_PATH_CAT = f'{WORKDIR}/KR_category_id.json'
MX_PATH_CAT = f'{WORKDIR}/MX_category_id.json'
RU_PATH_CAT = f'{WORKDIR}/RU_category_id.json'

# Extra, domain-specific stop words appended to the NLTK list by
# get_stop_list(): platform jargon, number words, and call-to-action /
# social-network terms.
# "six" was missing from the otherwise complete one..ten run and is added here.
new_words = ["youtube", "video", "channel", "link",
             "also", "always", "one", "two", "three",
             "four", "five", "six", "seven", "eight", "nine", "ten",
             "month", "today", "tomorrow", "comment", "like", "dislike", "tonight", "subscribe", "share",
             "twitter", "snapchat", "instagram", "facebook"]



def get_path_country(code):
  """Return the input file paths for one country.

  Args:
    code: Country prefix used in the file names (e.g. 'US').

  Returns:
    A tuple ``(videos_csv_path, category_json_path)`` located in WORKDIR.
  """
  videos_csv = f'{WORKDIR}/{code}videos.csv'
  categories_json = f'{WORKDIR}/{code}_category_id.json'
  return videos_csv, categories_json

def get_category_dict(CAT_PATH):
  """Load a category JSON file and map category ids to their titles.

  The file must contain a top-level "items" list whose entries each have
  an "id" and a "snippet" object with a "title".

  Args:
    CAT_PATH: Path of the category JSON file.

  Returns:
    dict mapping ``int(id)`` to the category title string.
  """
  # Decode explicitly as UTF-8 instead of relying on the platform default.
  with open(CAT_PATH, encoding='utf-8') as f:
    items = json.load(f)["items"]

  return {int(cat["id"]): cat["snippet"]["title"] for cat in items}

def fill_null_values(index, value, data):
  """Replace missing values of one column in place.

  Args:
    index: Name of the column to fix.
    value: Replacement used for every missing entry.
    data: DataFrame that is modified in place.
  """
  filled_column = data[index].fillna(value)
  data[index] = filled_column


def get_data(PATH, CAT_PATH):
  """Load one country's video CSV and apply the basic row/column filtering.

  Steps: drop unused columns, replace the numeric category id with its
  title, fill missing categories/descriptions, keep only rows with at
  least 1000 likes and 1000 views, substitute the title for the
  '[none]' tag placeholder, and keep a single row per video_id.

  Args:
    PATH: Path of the videos CSV file.
    CAT_PATH: Path of the matching category JSON file.

  Returns:
    A new DataFrame with a fresh 0..n-1 index.
  """
  data = pd.read_csv(PATH, encoding='utf-8')
  # Use the `columns=` keyword: passing the axis positionally
  # (`drop(labels, 1)`) is rejected by recent pandas versions.
  data = data.drop(columns=['comment_count', 'publish_time',
                            'thumbnail_link', 'comments_disabled',
                            'ratings_disabled'])

  title_dict = get_category_dict(CAT_PATH)
  data["category_id"] = data["category_id"].map(title_dict)

  fill_null_values('category_id', 'Other', data)
  fill_null_values('description', 'NoDescription', data)

  # Copy the filtered rows so the column assignment below works on an
  # independent frame instead of a slice of the original one.
  popular = (data.likes >= 1000) & (data.views >= 1000)
  data = data[popular].copy()

  data['tags'] = np.where(data['tags'] == '[none]', data['title'], data['tags'])

  data = data.drop_duplicates(['video_id'])
  data = data.reset_index(drop=True)

  return data
  


def clean_data(data):
  """Strip markup, hashtags, numbers, links, emoji and quotes from a string.

  Args:
    data: The raw text (a single string).

  Returns:
    The cleaned string. Case is not changed.
  """
  # HTML tags such as <b> ... </b>.
  data = re.sub(r'<.*?>', '', data)

  # HTML-escaped tags (&lt;b&gt;). The previous pattern also contained the
  # text `"," &lt;&gt; ` (a pasted replacement argument) and so never matched.
  data = re.sub(r'&lt;/?.*?&gt;', '', data)

  # Hashtags. The previous pattern was wrapped in JavaScript-style `/.../`
  # delimiters, which Python treats as literal slashes.
  data = re.sub(r'#\w+\s*', '', data)

  # Stand-alone integers.
  # NOTE(review): the first alternative starts with `$` (end of string)
  # followed by digits and can therefore never match; it may have been
  # meant as `^` or a literal `\$` -- left unchanged until confirmed.
  data = re.sub(r'$\d+\W+|\b\d+\b|\W+\d+$', '', data)

  # Links (raw string: `\S` is not a valid escape in a normal literal).
  data = re.sub(r'http\S+', '', data, flags=re.MULTILINE)

  # Collapse whitespace runs to one space; deleting them glued the
  # neighbouring words together.
  data = re.sub(r'\s\s+', ' ', data)

  # NOTE(review): the last range (U+24C2..U+1F251) is one contiguous span
  # that also contains e.g. CJK and Hangul code points, so such text is
  # removed as well. Kept as is to preserve current output.
  emoji_clean = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
  data = emoji_clean.sub(r'', data)

  data = data.replace('"', "")

  return data

def apply_clean_data(data, tag, oldtag):
  """Write a cleaned, lower-cased copy of one column into another column.

  Args:
    data: DataFrame modified in place.
    tag: Name of the destination column.
    oldtag: Name of the source column; each value goes through clean_data.
  """
  cleaned = data[oldtag].apply(clean_data)
  data[tag] = cleaned
  data[tag] = data[tag].str.lower()

   
def lemm_data(comment):
  """Lemmatize and lower-case every item of an iterable of words.

  Args:
    comment: Iterable of strings.

  Returns:
    List with the lemmatized, lower-cased items in the same order.
  """
  lemmatizer = WordNetLemmatizer()
  lemmas = []
  for token in comment:
    lemmas.append(lemmatizer.lemmatize(token).lower())
  return lemmas

def apply_lemm_data(data, tag):
  """Run lemm_data over every value of column `tag`, in place."""
  data[tag] = data[tag].apply(lemm_data)

def lemm_description(data, tag):
  """Lemmatize a text column word by word, in place.

  Each value is split on whitespace, every word is lemmatized and
  lower-cased, and the words are joined back with single spaces.

  Args:
    data: DataFrame modified in place.
    tag: Name of the text column.
  """
  lemmatizer = WordNetLemmatizer()

  def _lemmatize_text(text):
    tokens = text.split()
    return ' '.join([lemmatizer.lemmatize(token).lower() for token in tokens])

  data[tag] = data[tag].apply(_lemmatize_text)


def split_by_mark(data, tag, mark):
  """Turn a delimited string column into a column of lists, in place.

  Args:
    data: DataFrame modified in place.
    tag: Name of the column; its values are first converted to str.
    mark: Separator passed to ``Series.str.split``.
  """
  as_text = data[tag].astype(str)
  data[tag] = as_text
  data[tag] = data[tag].str.split(mark)
  
def clean_tags(data):
  """Create the 'clean_t' column from 'tags'.

  The tags are cleaned and lower-cased, split on '|' into a list, and
  every list item is lemmatized. `data` is modified in place.
  """
  target = "clean_t"
  apply_clean_data(data, target, 'tags')
  split_by_mark(data, target, '|')
  apply_lemm_data(data, target)


def clean_text(text):
  """Normalise free text before keyword extraction.

  Removes links, @mentions, #hashtags, apostrophe suffixes ('s, 're ...),
  punctuation, tokens containing digits and emoji, and collapses
  whitespace runs.

  All patterns are raw strings: sequences such as `\\S` or `\\w` inside a
  normal string literal are invalid escapes that newer Python versions
  warn about. The matched text is unchanged.

  Args:
    text: The raw text (a single string).

  Returns:
    The cleaned string.
  """
  text = re.sub(r"https*\S+", " ", text)   # links
  text = re.sub(r"@\S+", " ", text)        # mentions
  text = re.sub(r"#\S+", " ", text)        # hashtags
  text = re.sub(r"'\w+", " ", text)        # apostrophe + following letters
  text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
  text = re.sub(r'\w*\d+\w*', '', text)    # any token containing a digit
  text = re.sub(r'\s{2,}', " ", text)
  # NOTE(review): the last range (U+24C2..U+1F251) is one contiguous span
  # that also contains e.g. CJK and Hangul code points, so such text is
  # removed as well. Kept as is to preserve current output.
  emoji_clean = re.compile("["
                    u"\U0001F600-\U0001F64F"  # emoticons
                    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
                    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                    u"\U00002702-\U000027B0"
                    u"\U000024C2-\U0001F251"
                    "]+", flags=re.UNICODE)
  text = emoji_clean.sub(r'', text)

  return text


def apply_clean_text(data, tag):
  """Run clean_text over every value of column `tag`, in place."""
  data[tag] = data[tag].apply(clean_text)

def clean_description(data):
  """Create the 'clean_d' column from 'description'.

  Hyphens become spaces and literal backslash-n sequences become real
  newlines; the text is then passed through clean_text and lemmatized.
  `data` is modified in place.
  """
  def _normalise(raw):
    text = raw.replace("-", " ")
    # Double escaped newlines are folded into one before single ones.
    text = text.replace("\\n\\n", "\n")
    return text.replace("\\n", "\n")

  data['clean_d'] = data['description'].apply(_normalise)
  apply_clean_text(data, 'clean_d')
  lemm_description(data, 'clean_d')

def clean_title():
  """Placeholder: title cleaning is not implemented; returns None."""

def get_stop_list(language):
  """Build the stop-word list used by the TF-IDF vectorizers.

  Args:
    language: Language name understood by ``stopwords.words``.

  Returns:
    A new list: every punctuation character, then the NLTK stop words
    for `language`, then the project-specific `new_words`.
  """
  return [*string.punctuation, *stopwords.words(language), *new_words]


def get_tf_idf_keywords(stoplist, data):
  """Fit a TF-IDF vectorizer on the cleaned descriptions.

  Args:
    stoplist: Stop words passed to the vectorizer.
    data: DataFrame providing the `clean_d` text column.

  Returns:
    The fitted TfidfVectorizer (uni- and bi-grams, at most 10000
    features, terms present in more than 80% of documents ignored).
  """
  vectorizer = TfidfVectorizer(
      stop_words=stoplist,
      ngram_range=(1, 2),
      max_df=0.8,
      max_features=10000,
  )
  vectorizer.fit(data.clean_d)
  return vectorizer

def get_keywords(entry, tf_idf):
  """Return up to 15 top TF-IDF terms of one document.

  Args:
    entry: The document text.
    tf_idf: A fitted TfidfVectorizer.

  Returns:
    List of terms ordered by descending TF-IDF score.
  """
  doc = pd.Series(entry)
  doc_vector = tf_idf.transform(doc)
  sorted_items = sort_coo(doc_vector.tocoo())
  # `get_feature_names` was removed from recent scikit-learn releases in
  # favour of `get_feature_names_out`; fall back for old installations.
  if hasattr(tf_idf, 'get_feature_names_out'):
    feature_names = tf_idf.get_feature_names_out()
  else:
    feature_names = tf_idf.get_feature_names()
  keywords = extract_topn_from_vector(feature_names, sorted_items, 15)
  return list(keywords.keys())

def apply_get_keywords(data, language):
  """Add a 'keywords' column holding the top TF-IDF terms of 'clean_d'.

  Args:
    data: DataFrame with a 'clean_d' column; modified in place.
    language: Language whose stop words are excluded.
  """
  vectorizer = get_tf_idf_keywords(get_stop_list(language), data)
  data['keywords'] = data['clean_d'].apply(
      lambda description: get_keywords(description, vectorizer))
  

# apply_get_keywords(us_data)


def sort_coo(coo_matrix):
    """Return (column, value) pairs of a COO matrix, highest value first.

    Pairs with equal values are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first `topn` (index, score) pairs to {feature name: score}.

    Args:
        feature_names: Sequence indexed by the feature index.
        sorted_items: Sequence of (feature index, score) pairs, already
            ordered by the caller.
        topn: Number of leading pairs to keep.

    Returns:
        dict from feature name to the score rounded to 3 decimals, in the
        order of `sorted_items`.
    """
    leading = sorted_items[:topn]
    return {feature_names[col]: round(weight, 3) for col, weight in leading}


def all_words(data):
  """Store 'keywords' + 'clean_t' (row-wise) in a new 'all' column, in place."""
  keywords = data['keywords']
  tags = data['clean_t']
  data['all'] = keywords + tags


def join_words(data):
  """Remove all whitespace inside each string of an iterable.

  Strings of length 0 or 1 are passed through unchanged.

  Args:
    data: Iterable of strings.

  Returns:
    A new list with the squeezed strings.
  """
  return ["".join(w.split()) if len(w) > 1 else w for w in data]

def apply_join(data):
  """Build the final 'all' text column used for similarity, in place.

  'all' becomes: the keywords and tags (each squeezed into a single
  token by join_words) joined by spaces, followed by the lower-cased
  category and the lower-cased channel title. The 'channel_title'
  column itself is lower-cased as a side effect.

  A former statement that applied ``" ".join`` to every channel title and
  discarded the result has been removed: it had no effect on `data` and
  raised on non-string titles.
  """
  all_words(data)
  data['all'] = data['all'].apply(lambda z: " ".join(join_words(z)))
  data['channel_title'] = data['channel_title'].str.lower()
  data['all'] = data['all'] + " " + data['category_id'].str.lower() + " " + data['channel_title']


def get_clean_data_final(code, language):
  """Run the whole preprocessing pipeline for one country.

  Args:
    code: Country prefix of the input files (e.g. 'US').
    language: Stop-word language used for keyword extraction.

  Returns:
    The processed DataFrame. Its `size` is printed right after loading.
  """
  csv_path, categories_path = get_path_country(code)
  frame = get_data(csv_path, categories_path)
  print(frame.size)

  for step in (clean_description, clean_tags):
    step(frame)
  apply_get_keywords(frame, language)
  apply_join(frame)
  return frame

def save_clean_data(code, language, path):
  """Preprocess one country's data and write it to `path` as JSON."""
  get_clean_data_final(code, language).to_json(path)

def clean_entry_data(path, cat_path):
  """Load user-supplied videos from JSON and run the preprocessing pipeline.

  The JSON file must have a top-level 'videos' list; the code reads the
  fields 'categoryId', 'description', 'tags', 'title', 'channelName'
  and 'id' of its entries.

  Args:
    path: JSON file to read. An empty value falls back to the default
      Drive location, which keeps existing ``clean_entry_data('', ...)``
      calls working (the argument used to be ignored).
    cat_path: Category JSON file used to translate category ids.

  Returns:
    The processed DataFrame (English stop words are used).
  """
  source = path or "/content/drive/MyDrive/SAC/SAC_new_data.json"
  with open(source, encoding='utf-8') as f:
    data_json = json.load(f)['videos']

  data = pd.DataFrame(data_json)

  title_dict = get_category_dict(cat_path)

  # The category dictionary is keyed by int; coerce the ids so that
  # string ids such as "10" are matched too instead of all becoming 'Other'.
  category_ids = pd.to_numeric(data["categoryId"], errors='coerce')
  data["category_id"] = category_ids.map(title_dict)

  fill_null_values('category_id', 'Other', data)
  fill_null_values('description', 'NoDescription', data)

  def _join_tags(tags):
    # Missing tags (None / NaN) use the placeholder handled just below
    # instead of crashing inside str.join.
    if tags is None or isinstance(tags, float):
      return '[none]'
    return "|".join(tags)

  data['tags'] = data['tags'].apply(_join_tags)
  data['tags'] = np.where(data['tags'] == '[none]', data['title'], data['tags'])
  data['channel_title'] = data['channelName']
  data['video_id'] = data['id']

  print(data.size)
  clean_description(data)
  clean_tags(data)
  apply_get_keywords(data, 'english')
  apply_join(data)

  return data

def test():
  """Run the pipeline for the US data with English stop words and return it."""
  us_frame = get_clean_data_final('US', 'english')
  us_frame.head()
  return us_frame

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# Run the US preprocessing pipeline once and keep the resulting DataFrame.
dat = test()

59367


In [189]:
# Look up the combined text of the second row (the value is not used
# further), then persist the processed frame as JSON.
dat['all'].iloc[1]
dat.to_json('/content/drive/MyDrive/SAC/US-final-2.json')

In [None]:
import pandas as pd
# Reload the saved frame, keep one row per video_id, and preview it.
data_1 = pd.read_json('/content/drive/MyDrive/SAC/US-final-2.json')
data_1 = data_1.drop_duplicates(['video_id'])
data_1.head()

In [213]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_tf_idf_cosine(data, stopwords):
  """Vectorize the 'all' column with TF-IDF.

  Args:
    data: DataFrame with the combined text in column 'all'.
    stopwords: Stop words handed to the vectorizer.

  Returns:
    The sparse document-term matrix from ``fit_transform``.
  """
  vectorizer_options = dict(
      max_features=None,
      strip_accents="unicode",
      analyzer="word",
      token_pattern=r'\w{1,}',   # tokens of one or more word characters
      ngram_range=(1, 2),        # uni-grams and bi-grams
      stop_words=stopwords,
  )
  vectorizer = TfidfVectorizer(**vectorizer_options)
  return vectorizer.fit_transform(data['all'])

def get_matrix(tfv_matrix):
  """Return the pairwise cosine similarity of the rows of `tfv_matrix`."""
  return cosine_similarity(tfv_matrix, tfv_matrix)

def recommendations(video_id, data, cosine_sim, indices, topn=5):
    """Return the ids of the videos most similar to `video_id`.

    Args:
        video_id: Id of the query video; must be a key of `indices`.
        data: DataFrame whose row positions match `cosine_sim`.
        cosine_sim: Square similarity matrix (row i = similarities of row i).
        indices: Series mapping video id -> row position.
        topn: Number of recommendations to return (default 5, the number
            previously hard-coded).

    Returns:
        List of up to `topn` video ids, most similar first. The query
        video itself is never included.
    """
    idx = indices[video_id]
    scores = pd.Series(cosine_sim[idx])
    # Drop the query row explicitly. Skipping "the first sorted entry"
    # only works while the query is guaranteed to sort first, which is
    # not the case when another video has an identical score.
    scores = scores.drop(idx)
    ranked = scores.sort_values(ascending=False, kind="mergesort")
    # Materialise the id column once instead of once per recommendation.
    video_ids = data['video_id'].tolist()
    return [video_ids[i] for i in ranked.index[:topn]]

def get_indices(data):
  """Reduce `data` to the recommender columns and index it by video id.

  Args:
    data: DataFrame with 'video_id', 'title', 'all' and 'category_id'.

  Returns:
    ``(df_cleaned, indices)``: the de-duplicated frame with a fresh
    0..n-1 index, and a Series mapping video_id to that row index.
  """
  wanted = ['video_id', 'title', 'all', 'category_id']
  df_cleaned = (
      data[wanted]
      .drop_duplicates(['video_id'])
      .reset_index(drop=True)
  )
  lookup = pd.Series(df_cleaned.index, index=df_cleaned['video_id'])
  return df_cleaned, lookup

def post():
  """Placeholder: not implemented; returns None."""

def get_api():
  """Recommend videos for every entry of the user-supplied data set.

  The user entries are combined with at most 30000 rows of the
  preprocessed CA data; similarities are computed over the combined
  frame and the recommendations of all user entries are concatenated.

  Returns:
    Flat list of recommended video ids (may contain duplicates).
  """
  path = '/content/drive/MyDrive/SAC/US_category_id.json'

  data = clean_entry_data('', path)
  data_1 = pd.read_json('/content/drive/MyDrive/SAC/CA-final.json')
  # Cap the candidate pool at 30000 rows. The previous code derived the
  # limit from DataFrame.size (rows * columns) and stored it in a variable
  # named `len`, shadowing the builtin; a plain positional slice selects
  # exactly the same rows.
  datac = pd.concat([data, data_1.iloc[:30000]])
  new_data, indices = get_indices(datac)
  tfv_matrix = get_tf_idf_cosine(new_data, get_stop_list('english'))
  cosine_sim = get_matrix(tfv_matrix)

  result = []
  for video_id in data['video_id']:
    # extend() avoids rebuilding the list on every iteration.
    result.extend(recommendations(video_id, new_data, cosine_sim, indices))

  return result
  




In [214]:
# NOTE(review): get_api() is called twice here, so the whole pipeline
# (including the similarity matrix) is computed twice.
print(get_api())

result = get_api()
# dict.fromkeys drops repeated ids while keeping first-seen order.
mylist = list( dict.fromkeys(result) )
print(mylist) 

60
['lp-EO5I60KA', '2Vv-BfVoq4g', 'iWZmdoY1aTE', 'UDDMYw_IZnE', '817P8W8-mGE', 'PNCmrJI743E', 'D-nqqawsZfI', 'OMoY0SoP_b0', 'ZKchfFiHje0', '9pqaQvfHK-k', 'rsEne1ZiQrk', 'M4ZoCHID9GI', 'xnDHnm4jcMc', 'XR7Ev14vUh8', 'jnQ4V-wajLY', 'DDbx1uArVOM', 'bS9zXmexXUQ', 'h0zMjd5ZAJ4', 'aLRQLkqjHEg', 'KCCbbvAKxoc', 'JGwWNGJdvx8', '2Vv-BfVoq4g', 'iWZmdoY1aTE', '817P8W8-mGE', 'UDDMYw_IZnE']
60
['lp-EO5I60KA', '2Vv-BfVoq4g', 'iWZmdoY1aTE', 'UDDMYw_IZnE', '817P8W8-mGE', 'PNCmrJI743E', 'D-nqqawsZfI', 'OMoY0SoP_b0', 'ZKchfFiHje0', '9pqaQvfHK-k', 'rsEne1ZiQrk', 'M4ZoCHID9GI', 'xnDHnm4jcMc', 'XR7Ev14vUh8', 'jnQ4V-wajLY', 'DDbx1uArVOM', 'bS9zXmexXUQ', 'h0zMjd5ZAJ4', 'aLRQLkqjHEg', 'KCCbbvAKxoc', 'JGwWNGJdvx8']


In [208]:
# NOTE(review): get_api() is called twice here; the second call repeats
# the full computation.
print(get_api())
result = get_api()

60
                                               title  ... video_error_or_removed
0   Ed Sheeran - Shape of You (Official Music Video)  ...                    NaN
1  Biden: Health care orders undo the damage Trum...  ...                    NaN
2            The Weeknd - The Hills (Official Video)  ...                    NaN
3  Joe Biden 2021 Presidential Inauguration Ceremony  ...                    NaN
4  Ed Sheeran - Thinking Out Loud (Official Music...  ...                    NaN

[5 rows x 21 columns]
['lp-EO5I60KA', 'ZwvbQR887W0', 'yzTuBuRdAyA', 'I-QOOx_K9V0', 'ZwvbQR887W0', 'lp-EO5I60KA', 'yzTuBuRdAyA', 'JGwWNGJdvx8', 'lp-EO5I60KA', 'ZwvbQR887W0', 'I-QOOx_K9V0', 'JGwWNGJdvx8', 'I-QOOx_K9V0', 'lp-EO5I60KA', 'yzTuBuRdAyA', 'JGwWNGJdvx8', 'JGwWNGJdvx8', 'ZwvbQR887W0', 'yzTuBuRdAyA', 'I-QOOx_K9V0']
60
                                               title  ... video_error_or_removed
0   Ed Sheeran - Shape of You (Official Music Video)  ...                    NaN
1  Biden: Health care 

In [207]:
# Show title and id for every de-duplicated recommendation.
# NOTE(review): in the visible source `datac` is only assigned inside
# get_api(); this cell presumably relies on a global defined in a cell
# that is not shown -- confirm before re-running.
for i in mylist:
  print(datac[datac['video_id'] == i][['title','video_id']])

                                               title     video_id
4  Ed Sheeran - Thinking Out Loud (Official Music...  lp-EO5I60KA
                                               title     video_id
3  Joe Biden 2021 Presidential Inauguration Ceremony  ZwvbQR887W0
                                     title     video_id
2  The Weeknd - The Hills (Official Video)  yzTuBuRdAyA
                                               title     video_id
1  Biden: Health care orders undo the damage Trum...  I-QOOx_K9V0
                                              title     video_id
0  Ed Sheeran - Shape of You (Official Music Video)  JGwWNGJdvx8
