In [None]:
#
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# for keyword extraction
!pip install rake-nltk

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6


In [None]:
#
from google.colab import drive
drive.mount('/content/drive')

# Import the Excel file from Google Drive.
file_path = '/content/drive/MyDrive/Data_Science_springboard/Unit_30/'

file_name ='amazon_reviews_nlp.csv'
# Create a Pandas DataFrame from the csv file.
df = pd.read_csv(file_path + file_name)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Print the first 5 rows of the DataFrame     df.dtypes
#print(df.head())

df.drop('Unnamed: 5', axis = 1)

Unnamed: 0,RowNum,ID,Sentiment,Category,Review
0,1,0,5,Amazon Devices & Accessories,This 3rd Generation FireTV Stick now comes wit...
1,4,0,5,Amazon Devices & Accessories,***EDIT 05/31/2021 - READ WAY BELOW! AMAZON FI...
2,7,0,5,Amazon Devices & Accessories,All I needed was the remote. My offspring kill...
3,10,0,5,Amazon Devices & Accessories,I gave up cable for the obvious reasons. Skyro...
4,13,0,4,Amazon Devices & Accessories,"Tried repeatedly to get to work, spent several..."
...,...,...,...,...,...
10672,16,1315,5,Video Games,Finally I found a way to pay early. I will con...
10673,19,1315,5,Video Games,I love prime deals and I love Xbox gold and ga...
10674,22,1315,5,Video Games,These are normally $44 for three months. Durin...
10675,25,1315,5,Video Games,I usually buy this when there are deals on the...


In [None]:
#Next, we preprocess the data. This can involve removing stop words, converting all text to lower case, and lemmatizing words.
nltk.download('stopwords') # stopwords are a list of common words that are often removed from text
nltk.download('wordnet')
nltk.download('punkt') # late add due to error when calling function below
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# split the input text into individual words, or "tokens".
#
def preprocess_text(text):
    if pd.isnull(text):  # Check if text is NaN
        return ''
    text = str(text).lower()  # Convert text to string before lowercasing
    text = nltk.word_tokenize(text)
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    return ' '.join(text)



In [None]:
df['Review'] = df['Review'].apply(preprocess_text)
print(df['Review'])

0        3rd generation firetv stick come 3rd generatio...
1        * * * edit 05/31/2021 - read way ! amazon fire...
2        needed remote . offspring killed old fire stic...
3        gave cable obvious reason . skyrocket high cos...
4        tried repeatedly get work , spent several hour...
                               ...                        
10672    finally found way pay early . continue use use...
10673    love prime deal love xbox gold game pas . litt...
10674    normally $ 44 three month . time around black ...
10675    usually buy deal card , cheaper $ 15 . even $ ...
10676    work , got two 3 month digital code subscripti...
Name: Review, Length: 10677, dtype: object


In [None]:
# Next, we vectorize our text data using TF-IDF and split the dataset into a training set and a test set.
vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8)
processed_features = vectorizer.fit_transform(df['Review']).toarray()


In [None]:
#
X_train, X_test, y_train, y_test = train_test_split(processed_features, df['Sentiment'], test_size=0.2, random_state=0)

In [None]:
#Finally, we can train a Random Forest Classifier on our training data and evaluate its performance on our test data.
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

In [None]:
#
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))


[[134   0   0 ...   0   0   0]
 [  2   9   0 ...   0   0   0]
 [  1   0  15 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
              precision    recall  f1-score   support

           1       0.93      0.41      0.57       323
           2       1.00      0.14      0.24        65
           3       0.83      0.20      0.32        76
           4       1.00      0.17      0.29        83
           5       0.74      1.00      0.85      1440
         170       0.00      0.00      0.00         3
         171       0.00      0.00      0.00         2
         173       0.00      0.00      0.00         1
         174       0.00      0.00      0.00         2
         175       0.00      0.00      0.00         4
         176       0.00      0.00      0.00         2
         177       0.00      0.00      0.00         2
         179       0.00      0.00      0.00         3
         180       0.00      0.00      0.00        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from gensim.summarization import keywords    # not run

def extract_keywords(text, ratio=0.1):
    return keywords(text, ratio=ratio).split('\n')

#text = "Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions."
text =df['Review']
key_words = extract_keywords(text)

print(key_words)


ModuleNotFoundError: ignored

In [None]:
#
#!pip install rake-nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
print(df['Review'])

0        3rd generation firetv stick come 3rd generatio...
1        * * * edit 05/31/2021 - read way ! amazon fire...
2        needed remote . offspring killed old fire stic...
3        gave cable obvious reason . skyrocket high cos...
4        tried repeatedly get work , spent several hour...
                               ...                        
10672    finally found way pay early . continue use use...
10673    love prime deal love xbox gold game pas . litt...
10674    normally $ 44 three month . time around black ...
10675    usually buy deal card , cheaper $ 15 . even $ ...
10676    work , got two 3 month digital code subscripti...
Name: Review, Length: 10677, dtype: object


In [None]:
from rake_nltk import Rake

def extract_keywords_rake(text):
    r = Rake()
    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases()

#text = "Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions."
text = df['Review']
df['keywords'] = df['Review'].apply(extract_keywords_rake)
#key_words = extract_keywords_rake(text)

#print(key_words)
print(df['keywords'])

0        [might want wait grab firetv stick 4k version ...
1        [april security ... chromecast android tv andr...
2        [offspring killed old fire stick remote point ...
3        [gripe far peacock ’ app absolutely ’ seen dev...
4        [made arrangement speak got home asked couple ...
                               ...                        
10672    [finally found way pay early, continue use use...
10673    [little tricky figure finally realized needed ...
10674    [plus lot benefit ultimate membership believe ...
10675    [usually spend time trying finish 2 3 game wit...
10676    [com page offered 1 additional xbox game pas u...
Name: keywords, Length: 10677, dtype: object


In [None]:
#  'Category' now back in
df2 = df[['ID', 'Category', 'Sentiment', 'keywords', 'Review' ]]  #
df2.to_csv(file_path + 'df_categories.csv')

In [None]:
#create an empty dataframe,
dfKW =pd.DataFrame(columns=['ID', 'Sentiment', 'keywords'])

#loop through the original dataframe and append all keywords to the new dataframe with same id
for i, row in df.iterrows():
    kw = row['keywords']
    for j in range(len(kw)):
        dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
                   'keywords': kw[j],
                  }, ignore_index=True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID': row['ID'], 'Sentiment': row['Sentiment'],
  dfKW = dfKW.append({'ID

In [None]:
print(dfKW)
dfKW.to_csv(file_path + 'df_ID_Sentiments_keywords.csv')

          ID Sentiment                                           keywords
0          0         5  might want wait grab firetv stick 4k version e...
1          0         5  firetv stick 4k still shipping older 2nd gener...
2          0         5  support dolby atmos specification nice audioph...
3          0         5  currently appears though firetv stick 3rd gen ...
4          0         5  remote also allows control tv power volume but...
...      ...       ...                                                ...
161716  1315         5                     7 month xbox game pas ultimate
161717  1315         5                                          went xbox
161718  1315         5                                       thank amazon
161719  1315         5                                               work
161720  1315         5                                                 45

[161721 rows x 3 columns]


In [None]:
# Update keywords_list with dataframe df keywords
keywords_list = []

for row in df.itertuples():
    keywords_list.append(row.keywords)

In [None]:
#
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Join all phrases/sentences into one single string
## joined_keywords = [' '.join(keywords_list)]

#Convert each sublist of keywords_list into a single string, then join all into one single string
joined_keywords = [' '.join([' '.join(sublist) for sublist in keywords_list])]


# Create a matrix of TF-IDF features
tfidf = TfidfVectorizer().fit_transform(joined_keywords )

# Compute the cosine similarity of the first item in the list with the others
cosine_similarities = cosine_similarity(tfidf[0:1], tfidf).flatten()
most_similar = cosine_similarities.argsort()[:-5:-1]

print('Keywords most similar to the first one:')
for i in most_similar:
    print(keywords_list[i])

Keywords most similar to the first one:
['might want wait grab firetv stick 4k version eventually start shipping new gen 3 remote', 'firetv stick 4k still shipping older 2nd generation alexa voice remote currently though', 'support dolby atmos specification nice audiophiles home speaker system make use', 'currently appears though firetv stick 3rd gen device ship new remote', 'remote also allows control tv power volume button available firetv stick lite', '3rd generation firetv stick come 3rd generation alexa voice remote', 'new alexa voice remote 3rd generation recently released', 'dolby atmos – might big deal many people', 'one tv power button app button channel guide button', '” definitely tell difference navigating menu loading application', 'firetv stick 4k product exists want feature', 'biggest upgrade opinion coming 2nd generation device', 'mainly speed difference also newer remote nice', '10 le lose atmos support tv control', 'firetv cube currently shipping older remote', 'main 

In [None]:
# Count the number of words in each row:      alt version run
def count_words(df):
  for i in range(len(df)):
    df.loc[i, 'num_words'] = len(df.loc[i, 'keywords'].split())
  return df

df_ct_words = count_words(df.copy())
print(df_ct_words.head(25))


AttributeError: ignored

In [None]:
def count_words(df):
    for i in range(len(df)):
        df.loc[i, 'num_words'] = len(df.loc[i, 'keywords'])
    return df

df_ct_words = count_words(df.copy())
print(df_ct_words.head(25))

   RowNum  ID  Sentiment                      Category  \
0       1   0          5  Amazon Devices & Accessories   
1       4   0          5  Amazon Devices & Accessories   
2       7   0          5  Amazon Devices & Accessories   
3      10   0          5  Amazon Devices & Accessories   
4      13   0          4  Amazon Devices & Accessories   
5      16   0          2  Amazon Devices & Accessories   
6      19   0          4  Amazon Devices & Accessories   
7      22   0          1  Amazon Devices & Accessories   
8      25   0          1  Amazon Devices & Accessories   
9      28   0          5  Amazon Devices & Accessories   
10      1   2          5  Amazon Devices & Accessories   
11      4   2          5  Amazon Devices & Accessories   
12      7   2          5  Amazon Devices & Accessories   
13     10   2          1  Amazon Devices & Accessories   
14     13   2          1  Amazon Devices & Accessories   
15     16   2          1  Amazon Devices & Accessories   
16     19   2 

In [None]:
from collections import Counter  #testing alt function in next cell

def find_most_common_words(df):
    # Flatten the list of keywords after splitting by spaces
    all_words = [word for words in df['keywords'].str.split().dropna() for word in words]

    # Count occurrences of each word
    word_counter = Counter(all_words)

    # Convert counter object to a DataFrame
    most_common_words = pd.DataFrame(word_counter.most_common(10), columns=['word', 'count'])

    return most_common_words

most_common_words = find_most_common_words(df.copy())
print(most_common_words.head(15))


Empty DataFrame
Columns: [word, count]
Index: []


In [None]:
from collections import Counter

def find_most_common_words(df):
    # Flatten the list of keywords
    all_words = [word for words_list in df['keywords'].dropna() for word in words_list]

    # Count occurrences of each word
    word_counter = Counter(all_words)

    # Convert counter object to a DataFrame
    most_common_words = pd.DataFrame(word_counter.most_common(10), columns=['word', 'count'])

    return most_common_words

most_common_words = find_most_common_words(df.copy())
print(most_common_words)


      word  count
0        n   2017
1        5    734
2  however    474
3        1    453
4     ca n    349
5       ''    332
6        2    292
7      etc    285
8     also    282
9        3    278


In [None]:
#
def find_words_starting_with_letter(df, letter):
    # Flatten the list of lists in 'keywords' column into a single Series
    keywords_series = pd.Series([word for sublist in df['keywords'].dropna().tolist() for word in sublist])

    # Find and return the words that start with the specified letter
    words_starting_with_letter = keywords_series[keywords_series.str.startswith(letter)]

    return words_starting_with_letter

words_starting_with_letter = find_words_starting_with_letter(df.copy(), 'a')
print(words_starting_with_letter)


22               allowing control application tv experience
30                                               app button
47        april security ... chromecast android tv andro...
53        amazon firestick beat apple tv chromcast googl...
54        also returning shield keep something handle st...
                                ...                        
161611                                    also show mailbox
161618                                   annoyance negative
161671                                       also play halo
161695                       always least 15 big name title
161701                                  access hundred game
Length: 9111, dtype: object


In [None]:
def find_words_containing_substring(df, substring):
    # Flatten the list of lists in 'keywords' column into a single Series
    keywords_series = pd.Series([word for sublist in df['keywords'].dropna().tolist() for word in sublist])

    # Find and return the words that contain the specified substring
    words_containing_substring = keywords_series[keywords_series.str.contains(substring)]

    return words_containing_substring

words_containing_substring = find_words_containing_substring(df.copy(), 'erro')
print(words_containing_substring)

64                 made error earlier ethernet speed amazon
1794                                            throw error
6202      read lot negative review regard people issue p...
8388      rest setup process bit grind random error free...
8390             kept getting error error trying set iphone
                                ...                        
158198                                  error trying redeem
158459                  giving u error game card could read
158553             reproduce able error left right clicking
158815             bought second one see sort factory error
160858                                           user error
Length: 107, dtype: object


In [None]:
#
!pip install fuzzywuzzy
!pip install python-Levenshtein

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.21.1-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.21.1 (from python-Levenshtein)
  Downloading Levenshtein-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.5/172.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=2.3.0 (from Levenshtein==0.21.1->python-Levenshtein)
  Downloading rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m109.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.21.1 python-Levenshtein-0.21.1 rapid

In [None]:
#
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Two strings to compare
str1 = "easy to use"
str2 = "easy yo use"

# Simple Ratio
print(fuzz.ratio(str1.lower(), str2.lower()))

# Token Sort Ratio
print(fuzz.token_sort_ratio(str1, str2))

# Token Set Ratio
print(fuzz.token_set_ratio(str1, str2))

# Extract matches using process method
query = 'geeks for geeks'
choices = ['geek for geek', 'geek geek', 'g. for geeks']
print(process.extract(query, choices))

91
73
91
[('g. for geeks', 95), ('geek for geek', 93), ('geek geek', 86)]
