# import libraries and download other things we'll be using

In [15]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from rake_nltk import Rake

In [3]:
# Download the NLTK resources used below. 'stopwords' backs the
# nltk.corpus.stopwords import; 'punkt' is a tokenizer model
# (presumably required by Rake's sentence splitting -- TODO confirm).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HexMa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HexMa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# read from csv

In [10]:
# Read the comment CSV and keep only its first 500 rows, then preview
# the first five rows of that sample.
data = pd.read_csv("test_data_sentiment_output.csv").head(500)
data.head()

Unnamed: 0,id,roberta_neg,roberta_neu,roberta_pos,snippet.textOrginal,snippet.likeCount,snippet.publishedAt
0,UgzzkGXp9OqcNTIfg8N4AaABAg,0.001375,0.111756,0.886869,"Yep, 42 minutes of video. This was a big proje...",29165,2020-03-14T14:07:53Z
1,UgxHwgzeFMX03Ls9YoJ4AaABAg,0.573469,0.400585,0.025947,"Would like to see a follow-up on this take, no...",0,2023-09-21T19:24:12Z
2,UgyHNNyAj-FS9Ja1rSt4AaABAg,0.012195,0.274506,0.7133,"""The UK has one of the world's friendliest cop...",1,2023-09-21T17:01:28Z
3,Ugz2llq0yWm-fPYxguJ4AaABAg,0.162383,0.699382,0.138236,My take on the whole giphy being transformativ...,1,2023-09-19T15:18:44Z
4,UgyOKH4bXfPQ9zKU06t4AaABAg,0.62473,0.360903,0.014367,"""Fact checking"" is a modern euphemism for cens...",0,2023-09-18T15:37:31Z


In [13]:
# Keep the comments whose negative score exceeds 0.5, order them from most
# to least negative, and renumber the rows from zero.
sorted_data = (
    data
    .query('roberta_neg > 0.5')
    .sort_values('roberta_neg', ascending=False)
    .reset_index(drop=True)
)
sorted_data.head()

Unnamed: 0,id,roberta_neg,roberta_neu,roberta_pos,snippet.textOrginal,snippet.likeCount,snippet.publishedAt
0,UgwTBqgqNsagmYITOmx4AaABAg,0.974781,0.023283,0.001936,14:31 Are we really going to use TheQuarterInc...,2,2023-06-14T18:30:42Z
1,UgxW5M6fajGE0qkMShl4AaABAg,0.974346,0.018809,0.006845,I HATE IT I HATE,0,2023-07-19T14:10:04Z
2,UgxUVeTsrnf9uFaKP554AaABAg,0.974281,0.023191,0.002528,"It's real bad what is done with people, chumps.",0,2023-06-15T23:52:03Z
3,UgyVoTBVzjwCOfMg8kJ4AaABAg,0.971271,0.024506,0.004222,This is so lame,0,2022-09-19T11:25:39Z
4,UgyeQVEAxjmR1K92i054AaABAg,0.961498,0.034038,0.004464,I hate when everything is under license. You c...,1,2022-10-05T03:47:41Z


# using rake

In [88]:
# Try RAKE with its default settings on a single comment: the row of
# `data` labelled 0.
r = Rake()
r.extract_keywords_from_text(data["snippet.textOrginal"][0])
# Returns (score, phrase) tuples; the displayed output lists them from
# highest to lowest score.
r.get_ranked_phrases_with_scores()

[(25.0, 'part original series called money'),
 (8.0, 'https :// curiositystream'),
 (4.0, 'new five'),
 (4.0, 'big project'),
 (4.0, '42 minutes'),
 (2.0, 'curiositystream'),
 (1.0, 'yep'),
 (1.0, 'watch'),
 (1.0, 'video'),
 (1.0, 'tomscott'),
 (1.0, 'plus'),
 (1.0, 'nebula'),
 (1.0, 'got'),
 (1.0, 'com'),
 (1.0, 'bundle')]

In [20]:
# Run RAKE over every comment in sorted_data, using an explicit English
# stop-word list instead of the library defaults.
stop_words = set(stopwords.words('english'))
r2 = Rake(stopwords=stop_words)
# Extract with r2 itself: calling the earlier `r` instance here would silently
# ignore the stop words configured above.
r2.extract_keywords_from_sentences(sorted_data["snippet.textOrginal"])
r2.get_ranked_phrases_with_scores()

[(45.13690476190476, 'old money gets extra special super duper rights'),
 (39.85714285714286, '55 yooooooooo legal eagle ???? last place'),
 (28.033333333333335, 'abused via 2 clicks without consequences'),
 (26.51818181818182, 'new youtube ceo jerk mcjerk face'),
 (24.5, 'mystery science theater 3000 reference'),
 (20.333333333333332, 'worlds biggest stockholm syndrom case'),
 (20.333333333333332, 'cnn using allen pan ’'),
 (20.31818181818182, 'newest latest youtube algorithm detects'),
 (19.8, 'third parties )." intellectual property'),
 (18.57142857142857, 'footage … yet every second'),
 (18.545454545454547, 'related companies within 90 days'),
 (17.365163572060123, 'shakespeare copyright ltd ." would'),
 (17.357142857142858, 'released 3 years later 💀'),
 (17.011204481792717, 'normal basis get 20 years'),
 (16.90909090909091, 'music scene sonny bono used'),
 (16.5, 'demolished man reference threw'),
 (16.485714285714288, 'current patent laws limit protection'),
 (16.0, 'age well sin

# using sklearn

In [18]:
# import scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

## get word counts

In [60]:
def set_text_to_lower(text_list):
    """Return a new list holding the lower-cased form of every string in ``text_list``.

    The input iterable is not modified; item order is preserved.
    """
    return [entry.lower() for entry in text_list]

In [80]:
# max_df=0.9: ignore any word that appears in more than 90% of the documents
# min_df=2: only keep words that appear in at least two documents
cv = CountVectorizer(stop_words="english", max_df=0.9, min_df=2, strip_accents='ascii')
# learn the vocabulary from the comments and count word use, stored as a sparse matrix
word_counts = cv.fit_transform(sorted_data["snippet.textOrginal"])
# word_counts has one row for each document and one column for each word, e.g.
#   (0, 5) 1
# means that word 5 appears once in document 0
print(word_counts[0])

  (0, 5)	1
  (0, 218)	1
  (0, 122)	1
  (0, 279)	1
  (0, 102)	1
  (0, 183)	1
  (0, 297)	1
  (0, 34)	1
  (0, 158)	1


In [82]:
# Vocabulary learned by the fitted CountVectorizer; position i holds the word
# for column i of word_counts. Preview the first 20 entries.
features = cv.get_feature_names_out()
features[:20]

array(['000', '10', '100', '11', '12', '14', '15', '20', '2022', '40',
       '50', '70', '90', 'able', 'abolish', 'absolutely', 'abuse',
       'access', 'accountable', 'actual'], dtype=object)

## tfidf

In [85]:
def get_keywords(text, top_n=20):
    """Print the highest TF-IDF scoring words found in ``text``.

    Relies on the notebook-level ``cv`` (the fitted CountVectorizer) and
    ``word_counts`` (the count matrix the IDF weights are fitted on).

    Parameters
    ----------
    text : str or iterable of str
        One comment, or a collection of comments, to score.
    top_n : int, default 20
        Maximum number of (document, word) entries to print.

    Notes
    -----
    Entries are ranked across all documents together, so the same word can be
    printed more than once when it scores highly in several comments.
    Nothing is printed when no word of ``text`` is in the vocabulary.
    """
    # CountVectorizer.transform expects an iterable of documents and rejects a
    # bare string, so wrap a single comment in a list.
    if isinstance(text, str):
        text = [text]
    transformer = TfidfTransformer()
    transformer.fit(word_counts)
    # word-count matrix for the given comments, using cv's existing vocabulary
    word_count_vector = cv.transform(text)
    # tf-idf weights in COO form, which exposes (row, col, value) triples directly
    tfidf_vector = transformer.transform(word_count_vector).tocoo()
    tuples = zip(tfidf_vector.row, tfidf_vector.col, tfidf_vector.data)
    ranked = sorted(tuples, key=lambda x: x[2], reverse=True)
    top_keywords = ranked[:top_n]
    # Guard the empty case: indexing [0] below would raise IndexError otherwise.
    if not top_keywords:
        return
    # Look the words up from cv itself so this function does not depend on a
    # separately computed `features` variable.
    vocabulary = cv.get_feature_names_out()
    print(top_keywords[0])
    for tup in top_keywords:
        print(vocabulary[tup[1]], tup[2])

In [86]:
get_keywords(sorted_data['snippet.textOrginal'])

(1, 128, 1.0)
hate 1.0
weird 1.0
wrong 1.0
start 1.0
broken 1.0
far 1.0
paying 1.0
world 1.0
15 1.0
sue 1.0
copywrite 1.0
sucks 0.9213323524476182
balls 0.8694454895900886
using 0.8694454895900886
theft 0.8531337171139333
instead 0.8060631985835407
bad 0.7913200827634193
reference 0.7692608139927108
reference 0.7576585684164264
evil 0.7440852152829044
