In [None]:
import json
import csv
import re
import sys
import pandas as pd
import numpy as np
import heapq

import gensim
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

#####################################################
# Run configuration: edit these values before running the cells below.
# The commented-out lines appear to be a leftover command-line interface
# (see the example invocation at the end); they are not executed.
#args = sys.argv
#if len(args) < 2:
#    print("You forgot something")
FILE_1 = 'music_dev'  # base name (no extension) of the interim CSV the similarity index is built from, e.g. games_train
FILE_2 = 'games_val'  # base name (no extension) of the interim CSV whose reviews are scored against FILE_1, e.g. sew_train
FILE_NAME = 'games'  # dataset tag; in this notebook only the commented-out similarity-score export refers to it
N_DIS = 100   # how many of the lowest-scoring (most dissimilar) reviews to select
# Example of the former script invocation: python3 cosine.py 'music_dev' 'sew_val' 2

# Functions

In [None]:
def csv_loader(PATH):
    """Read a two-column review CSV into a DataFrame.

    Parameters
    ----------
    PATH : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``review`` and ``sentiment``.  Because the
        column names are supplied explicitly, every line of the file is read
        as data; a header line, if the file has one, ends up as row 0.
    """
    column_names = ['review', 'sentiment']
    return pd.read_csv(PATH, names=column_names)

# Load Training Data

In [None]:
# Reference corpus: read the FILE_1 interim CSV and separate text from labels.
data_1 = csv_loader('../data/interim/' + FILE_1 + '.csv')
y_1 = data_1[['sentiment']]
# Only the first 15 reviews are kept (presumably a small sample for quick runs).
X_1 = data_1[['review']].iloc[0:15]

# Query corpus: same treatment for the FILE_2 interim CSV.
data_2 = csv_loader('../data/interim/' + FILE_2 + '.csv')
y_2 = data_2[['sentiment']]
X_2 = data_2[['review']].iloc[0:15]

# Display how many reviews each sample holds.
len(X_1), len(X_2)

# Tokenize Corpus 1

In [None]:
# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# One token list per review in X_1: tokenize, lowercase, and drop stop words.
corp_1 = [
    [tok.lower() for tok in word_tokenize(review) if tok.lower() not in stop_words]
    for review in X_1['review']
]
#print(f"First Corpus: {corp_1[0]}")

# Get Dictionary and TF-IDF

In [None]:
# Map every distinct token of corpus 1 to an integer id.
dictionary = gensim.corpora.Dictionary(corp_1)

# Bag-of-words representation of each tokenized review.
corpus = list(map(dictionary.doc2bow, corp_1))

# TF-IDF weighting fitted on corpus 1, so frequent words count for less.
tf_idf = gensim.models.TfidfModel(corpus)

# Similarity index over the TF-IDF vectors of corpus 1.  The first argument
# ('workdir/') is the output prefix gensim uses for the index files.
sims = gensim.similarities.Similarity(
    'workdir/',
    tf_idf[corpus],
    num_features=len(dictionary),
)
print(f"\nlength of corpus: {len(corp_1)}")

# Tokenize Corpus 2 and Compute Cosine Similarity

In [None]:
corp_2 = []    # tokenized reviews of the second corpus
avg_sims = []  # (average similarity, row position in X_2) pairs

# Row 0 of X_2 is skipped on purpose by starting at 1
# (presumably it holds the CSV header line - TODO confirm).
for idx in range(1, len(X_2)):
    review_text = X_2.iloc[idx]['review']
    tokens = [tok.lower() for tok in word_tokenize(review_text) if tok.lower() not in stop_words]
    corp_2.append(tokens)

    # Bag of words against the corpus-1 dictionary (the dictionary itself is
    # not modified), re-weighted with the TF-IDF model fitted on corpus 1.
    query_tf_idf = tf_idf[dictionary.doc2bow(tokens)]

    # Cosine similarity of this review to every document in the index.
    doc_sim = sims[query_tf_idf]

    # Mean similarity over corpus 1, stored together with the row position.
    sim_ave = np.sum(doc_sim, dtype=np.float32) / len(corp_1)
    avg_sims.append((sim_ave, idx))
    
    #print(f"Average Similarity: {sim_ave}")
    

In [None]:
#print(f"\nAverage similarities: {avg_sims}")

# Save Similarity Scores

In [None]:
# Keep just the score from each (score, index) pair, as a one-column frame.
score_values = [score for score, _position in avg_sims]
sims_only = pd.DataFrame(score_values, columns=['similarity'])
#sims_only.to_csv('../data/dissimilar/'+FILE_NAME+'_sim_score.csv', index=False, header=False)

# Heap of (Similarity Score, Sentence Index)

In [None]:
# Pick the N_DIS (score, index) pairs with the lowest average similarity.
# Tuples compare element by element, so ties on the score are broken by the
# row index.  The result is sorted from most to least dissimilar.
pq = heapq.nsmallest(N_DIS, avg_sims)
# The original f-string contained a stray ")" that was printed after the list.
print(f"\nPriority Q: {pq}")

In [None]:
# For every selected (score, index) pair, pull the review text and its label
# from the second corpus and keep the score and original row position with them.
dissimilar_rows = [
    (X_2.iloc[idx]['review'], y_2.iloc[idx]['sentiment'], score, idx)
    for score, idx in pq
]
print(f"\nMost Dissimilar Sentence: {dissimilar_rows[0]}")

most_dis = pd.DataFrame(
    dissimilar_rows,
    columns=['review','sentiment','cosine_score','orig_index'],
)
most_dis.head(2)

# Slice Top 10, 100, 1000, 10000 Most Dissimilar

In [None]:
# most_dis is ordered from most to least dissimilar, so the leading rows are
# the most dissimilar reviews.
top_10 = most_dis.iloc[:10]
top_100 = most_dis.iloc[:100]
#top_1000 = most_dis[0:1000]
#top_10thou = most_dis[0:10000]

In [None]:
# Quick inspection of the 10-row slice.  Only the last expression of a cell is
# displayed, so the type() result is evaluated but not shown.
type(top_10)
top_10.head(2)

In [None]:
# Round-trip the ten most dissimilar rows through a CSV written without index
# or header, to check that the saved file reads back with the expected columns.
top_10.to_csv('test.csv', index=False, header=False)
# read_csv has no `index` parameter (it belongs to to_csv); passing it raised a
# TypeError, so it is removed here.
top_10 = pd.read_csv('test.csv', names=['review','sentiment','cosine_score','orig_index'])
top_10.head(2)

# Checking out similarity scores

Sanity check: inspect which sentences were ranked as most dissimilar, and confirm that `orig_index` points back to the correct row of the source data.<br>
There appears to be a lot of Spanish in the games dataset.

## Games

In [None]:
# Load the saved games dissimilarity file; column names are supplied
# explicitly, so every line of the file is read as data.
game_dis = pd.read_csv('../data/dissimilar/games10000.csv',names=['review','sentiment','cosine_score','orig_index'])
# Show rows 10-14 for a manual look at the reviews and their scores.
game_dis[10:15]

In [None]:
# Load the games training split.  No column names are passed here (unlike
# csv_loader), so pandas takes them from the first line of the file.
train_g = pd.read_csv('../data/interim/games_train.csv')

In [None]:
# Show a single row by position.  214728 is presumably an orig_index value
# taken from game_dis - TODO confirm it lines up with this frame's row numbering.
train_g[214728:214729]

In [None]:
# Load the saved sewing dissimilarity file; column names are supplied
# explicitly, so every line of the file is read as data.
sew_dis = pd.read_csv('../data/dissimilar/sew10000.csv',names=['review','sentiment','cosine_score','orig_index'])
# Show rows 10-14 for a manual look at the reviews and their scores.
sew_dis[10:15]

## Sewing

In [None]:
# Load the sewing training split.  No column names are passed here (unlike
# csv_loader), so pandas takes them from the first line of the file.
train_s = pd.read_csv('../data/interim/sew_train.csv')

In [None]:
# Show a single row by position.  233295 is presumably an orig_index value
# taken from sew_dis - TODO confirm it lines up with this frame's row numbering.
train_s[233295:233296]

# Average Similarity Scores

In [None]:
# Load the per-review similarity scores for the sewing data as a single
# 'Cosine' column.  NOTE: this rebinds sew_dis, which earlier in the notebook
# held the sew10000 dissimilarity frame.
sew_dis = pd.read_csv('../data/dissimilar/sew_sim_score.csv',names=['Cosine'])

In [None]:
# Mean of the per-review similarity scores for the sewing data.
# The 'Cosine' column is reduced to a scalar before float() is applied:
# calling float() on a one-element Series (what DataFrame.sum() returns here)
# is deprecated in recent pandas.  The trailing numbers are values noted from
# an earlier run.
total = sew_dis['Cosine'].sum() # 6807.180589
dataset_size = len(sew_dis) # 364685
ave_sim = float(total / dataset_size)
print(f"Average Cosine Similarity in Sewing Data: {ave_sim}")

In [None]:
# Load the per-review similarity scores for the games data as a single
# 'Cosine' column.
games_dis = pd.read_csv('../data/dissimilar/games_sim_score.csv',names=['Cosine'])

In [None]:
# Mean of the per-review similarity scores for the games data.
# The 'Cosine' column is reduced to a scalar before float() is applied:
# calling float() on a one-element Series (what DataFrame.sum() returns here)
# is deprecated in recent pandas.  The trailing numbers are values noted from
# an earlier run.
total = games_dis['Cosine'].sum() # 6588.507814
dataset_size = len(games_dis) # 350744
ave_sim = float(total / dataset_size)
print(f"Average Cosine Similarity in Games Data: {ave_sim}")

# Check Distribution of Labels

## Sewing Data

In [None]:
# Label balance of the sewing validation split.
val_sew = pd.read_csv('../data/interim/sew_val.csv',names=['review','label'])
# Row 0 is discarded (presumably the file's header line, which is read as data
# because column names are supplied).  reset_index(drop=True) renumbers the
# rows from 0; the previous reset_index().drop('index', 1) passed the axis
# positionally, which pandas 2.x no longer accepts.
val_sew = val_sew[1:].reset_index(drop=True)
corp_len = len(val_sew)
# Labels are compared with the string '1', as in the original loop; every
# other value (including missing ones) counts as negative.
pos = int((val_sew['label'] == '1').sum())
neg = corp_len - pos
print(f"Count Positive: {pos}\nCount Negative: {neg}\nRatio: {pos/corp_len*100}% Positive")

In [None]:
# Label balance in the sew10 dissimilarity file.
dis_s10 = pd.read_csv('../data/dissimilar/sew10.csv',names=['review','sentiment','cosine_score','orig_index'])
corp_len = len(dis_s10)
# A row is positive when its sentiment equals the integer 1; all others are negative.
pos = int((dis_s10['sentiment'] == 1).sum())
neg = corp_len - pos
print(f"Count Positive: {pos}\nCount Negative: {neg}\nRatio: {pos/corp_len*100}% Positive")

In [None]:
# Label balance in the sew100 dissimilarity file.
dis_s100 = pd.read_csv('../data/dissimilar/sew100.csv',names=['review','sentiment','cosine_score','orig_index'])
corp_len = len(dis_s100)
# A row is positive when its sentiment equals the integer 1; all others are negative.
pos = int((dis_s100['sentiment'] == 1).sum())
neg = corp_len - pos
print(f"Count Positive: {pos}\nCount Negative: {neg}\nRatio: {pos/corp_len*100}% Positive")

In [None]:
# Label balance in the sew1000 dissimilarity file.
dis_s1000 = pd.read_csv('../data/dissimilar/sew1000.csv',names=['review','sentiment','cosine_score','orig_index'])
corp_len = len(dis_s1000)
# A row is positive when its sentiment equals the integer 1; all others are negative.
pos = int((dis_s1000['sentiment'] == 1).sum())
neg = corp_len - pos
print(f"Count Positive: {pos}\nCount Negative: {neg}\nRatio: {pos/corp_len*100}% Positive")

In [None]:
# Label balance in the sew10000 dissimilarity file (ratio rounded to 2 decimals).
dis_s10000 = pd.read_csv('../data/dissimilar/sew10000.csv',names=['review','sentiment','cosine_score','orig_index'])
corp_len = len(dis_s10000)
# A row is positive when its sentiment equals the integer 1; all others are negative.
pos = int((dis_s10000['sentiment'] == 1).sum())
neg = corp_len - pos
print(f"Count Positive: {pos}\nCount Negative: {neg}\nRatio: {round(pos/corp_len*100,2)}% Positive")

## Games Data

In [None]:
# Label balance of the games validation split.
val_games = pd.read_csv('../data/interim/games_val.csv',names=['review','label'])
# Row 0 is discarded (presumably the file's header line, which is read as data
# because column names are supplied).  reset_index(drop=True) renumbers the
# rows from 0; the previous reset_index().drop('index', 1) passed the axis
# positionally, which pandas 2.x no longer accepts.
val_games = val_games[1:].reset_index(drop=True)
corp_len = len(val_games)
# Count labels of the games frame itself.  The original loop read
# val_sew['label'], a copy-paste slip that reported the sewing labels instead.
# Labels are compared with the string '1'; every other value counts as negative.
pos = int((val_games['label'] == '1').sum())
neg = corp_len - pos
print(f"Count Positive: {pos}\nCount Negative: {neg}\nRatio: {round(pos/corp_len*100,2)}% Positive")

In [None]:
# Label balance in the games10 dissimilarity file.
dis_g10 = pd.read_csv('../data/dissimilar/games10.csv',names=['review','sentiment','cosine_score','orig_index'])
corp_len = len(dis_g10)
# Count on the games frame loaded above.  The original loop read dis_s10 (the
# sewing frame), a copy-paste slip that reported the sewing counts instead.
# A row is positive when its sentiment equals the integer 1.
pos = int((dis_g10['sentiment'] == 1).sum())
neg = corp_len - pos
print(f"Count Positive: {pos}\nCount Negative: {neg}\nRatio: {pos/corp_len*100}% Positive")

In [None]:
# Label balance in the games100 dissimilarity file.
dis_g100 = pd.read_csv('../data/dissimilar/games100.csv',names=['review','sentiment','cosine_score','orig_index'])
corp_len = len(dis_g100)
# Count on the games frame loaded above.  The original loop read dis_s100 (the
# sewing frame), a copy-paste slip that reported the sewing counts instead.
# A row is positive when its sentiment equals the integer 1.
pos = int((dis_g100['sentiment'] == 1).sum())
neg = corp_len - pos
print(f"Count Positive: {pos}\nCount Negative: {neg}\nRatio: {pos/corp_len*100}% Positive")

In [None]:
# Label balance in the games1000 dissimilarity file.
dis_g1000 = pd.read_csv('../data/dissimilar/games1000.csv',names=['review','sentiment','cosine_score','orig_index'])
corp_len = len(dis_g1000)
# Count on the games frame loaded above.  The original loop read dis_s1000 (the
# sewing frame), a copy-paste slip that reported the sewing counts instead.
# A row is positive when its sentiment equals the integer 1.
pos = int((dis_g1000['sentiment'] == 1).sum())
neg = corp_len - pos
print(f"Count Positive: {pos}\nCount Negative: {neg}\nRatio: {pos/corp_len*100}% Positive")

In [None]:
# Label balance in the games10000 dissimilarity file (ratio rounded to 2 decimals).
dis_g10000 = pd.read_csv('../data/dissimilar/games10000.csv',names=['review','sentiment','cosine_score','orig_index'])
corp_len = len(dis_g10000)
# Count on the games frame loaded above.  The original loop read dis_s10000
# (the sewing frame), a copy-paste slip that reported the sewing counts instead.
# A row is positive when its sentiment equals the integer 1.
pos = int((dis_g10000['sentiment'] == 1).sum())
neg = corp_len - pos
print(f"Count Positive: {pos}\nCount Negative: {neg}\nRatio: {round(pos/corp_len*100,2)}% Positive")

# Old Stuff

In [None]:
# Raw (untokenized) review strings: rows 0-4 of the first corpus.
# NOTE: this rebinds corp_1, replacing the tokenized corpus built earlier.
corp_1 = [X_1.iloc[pos]['review'] for pos in range(5)]

# Rows 0-2 of the second corpus, with row 0 dropped afterwards.
corp_2 = [X_2.iloc[pos]['review'] for pos in range(3)][1:]
corp_2

In [None]:
# Despite the variable name, this is a TF-IDF vectorizer (not a plain count
# vectorizer); it removes scikit-learn's built-in English stop words.
count_vectorizer = TfidfVectorizer(stop_words='english')

In [None]:
# Learn the vocabulary and IDF weights from corp_1 and vectorize it in one step.
sparse_matrix_1 = count_vectorizer.fit_transform(corp_1)

# Optional: dense DataFrame view with one column per vocabulary term, handy
# for inspecting the TF-IDF weight of each word in each review.
doc_term_matrix_1 = sparse_matrix_1.todense()
feature_names_1 = count_vectorizer.get_feature_names_out()
df_1 = pd.DataFrame(doc_term_matrix_1, columns=feature_names_1)
df_1

In [None]:
# Vectorize corp_2 with the vocabulary and IDF weights already learned from
# corp_1.  transform() is used instead of fit_transform(): re-fitting would
# replace the vocabulary, so df_1 and df_2 would have different columns and
# could not be compared with cosine similarity.
sparse_matrix_2 = count_vectorizer.transform(corp_2)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the
# TF-IDF weights.  The columns are the same vocabulary terms as in df_1.
doc_term_matrix_2 = sparse_matrix_2.todense()
df_2 = pd.DataFrame(doc_term_matrix_2,
                  columns=count_vectorizer.get_feature_names_out())
df_2

In [None]:
# Compute Cosine Similarity
# Prints a matrix with one row per review in df_1 and one column per review in
# df_2.  cosine_similarity requires both inputs to have the same number of
# feature columns.
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(df_1, df_2))