# Library Installation

In [0]:
!pip install rake-nltk
!pip install -U textblob
!pip install flask_jsonpify
!!pip install swifter
!python -m textblob.download_corpora
!pip install numba
import spacy.cli
spacy.cli.download("en_core_web_md")

# spacy.cli.download("en_core_web_lg")

Collecting rake-nltk
  Downloading https://files.pythonhosted.org/packages/8e/c4/b4ff57e541ac5624ad4b20b89c2bafd4e98f29fd83139f3a81858bdb3815/rake_nltk-1.0.4.tar.gz
Building wheels for collected packages: rake-nltk
  Building wheel for rake-nltk (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/ef/92/fc/271b3709e71a96ffe934b27818946b795ac6b9b8ff8682483f
Successfully built rake-nltk
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.4
Requirement already up-to-date: textblob in /usr/local/lib/python3.6/dist-packages (0.15.3)
Collecting flask_jsonpify
  Downloading https://files.pythonhosted.org/packages/60/0f/c389dea3988bffbe32c1a667989914b1cc0bce31b338c8da844d5e42b503/Flask-Jsonpify-1.5.0.tar.gz
Building wheels for collected packages: flask-jsonpify
  Building wheel for flask-jsonpify (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/ea/a9/40/ac47ad604861c1a40499042d30c22cdb7d1fa1abf426597788
Successfu

# Library Import & Data Loading 

In [0]:
# Libraries 
import pandas as pd
from rake_nltk import Metric, Rake
from textblob import TextBlob
from textblob.np_extractors import ConllExtractor
from textblob.sentiments import NaiveBayesAnalyzer
import numba

In [0]:
data = pd.read_csv("cleaned_bank_cust_complaints_med.csv")

In [0]:
data.head(5)

Unnamed: 0,Product,Consumer_complaint_narrative,category_id
0,"Credit reporting, credit repair services, or o...",I am a victim of identity theft. My personal i...,0
1,"Credit reporting, credit repair services, or o...","Transunion, XXXX, XXXX all show inquiries to b...",0
2,"Credit reporting, credit repair services, or o...",The Summer of XX/XX/2018 I was denied a mortga...,0
3,"Credit reporting, credit repair services, or o...","XXXX, XXXX, Experian all show inquiries to bus...",0
4,"Credit reporting, credit repair services, or o...",There are many mistakes appear in my report wi...,0


In [0]:
data_mini = data[:10000]

In [0]:
len(data_mini)

10000

In [0]:
import spacy # (I have version 2.0.18)

In [0]:
nlp = spacy.load('en')

In [0]:
@numba.jit()
def docObject(eachValue):
  return nlp(eachValue)

In [0]:
%timeit data_mini['Consumer_complaint_narrative_spacy_object'] = data_mini['Consumer_complaint_narrative'].apply(docObject)

CPU times: user 14min 41s, sys: 6min 10s, total: 20min 52s
Wall time: 10min 37s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
%time data_mini['Consumer_complaint_narrative_spacy_object'] = data_mini['Consumer_complaint_narrative'].apply(docObject)

CPU times: user 17min 13s, sys: 6min 29s, total: 23min 43s
Wall time: 12min 5s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Cleaning, Sentiment, Topics, NounPhrase & NounAdjectivePair

In [0]:
# Text Cleaning 
import string

def TextCleaning(eachValue):
  try:
    return eachValue.translate(str.maketrans('', '', string.punctuation)).lower()
  except AttributeError:
    return []

In [0]:
data_mini['CleanTranslate'] = data_mini.Consumer_complaint_narrative.apply(TextCleaning)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
numba.jit()
def GetSentiment(eachValue):
  try:
    return float("{0:.2f}".format(TextBlob(eachValue).sentiment.polarity))
  except TypeError:
    return []

In [0]:
def GetSentimentSubjectivity(eachValue):
  try:
    return float("{0:.2f}".format(TextBlob(eachValue).sentiment.subjectivity))
  except TypeError:
    return []

In [0]:
def GetNounPhrase(eachValue):
  try:
    return TextBlob(eachValue, np_extractor=extractor).noun_phrases
  except TypeError:
    return []
    

In [0]:
extractor = ConllExtractor()

In [0]:
%time data_mini['NounPhrase'] = data_mini.CleanTranslate.apply(GetNounPhrase)

CPU times: user 29 s, sys: 159 ms, total: 29.2 s
Wall time: 29.2 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
%time data_mini['TextSentiment']  = data_mini.CleanTranslate.apply(GetSentiment)

CPU times: user 14.8 s, sys: 16.2 ms, total: 14.9 s
Wall time: 14.9 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
data_mini[['CleanTranslate','TextSentiment','NounPhrase']].head(10)

Unnamed: 0,CleanTranslate,TextSentiment,NounPhrase
0,i am a victim of identity theft my personal in...,0.1,"[identity theft, personal information, xxxx xx..."
1,transunion xxxx xxxx all show inquiries to bus...,0.1,[transunion xxxx xxxx]
2,the summer of xxxx2018 i was denied a mortgage...,-0.01,"[mortgage loan due, xxxx xxxx credit card, acc..."
3,xxxx xxxx experian all show inquiries to busin...,0.1,[xxxx xxxx experian]
4,there are many mistakes appear in my report wi...,0.5,[]
5,there are many mistakes appear in my report wi...,0.5,[]
6,there are many mistakes appear in my report wi...,0.5,[]
7,there are many mistakes appear in my report wi...,0.5,[]
8,there are many mistakes appear in my report wi...,0.5,[]
9,there are many mistakes appear in my report wi...,0.5,[]


In [0]:
def ExtractTopic(eachValue):
  try:
    r = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO, min_length=2, max_length=4) # Uses stopwords for english from NLTK, and all puntuation characters.

    r.extract_keywords_from_text(eachValue)

    return r.get_ranked_phrases()
  except TypeError:
    return []

In [0]:
%time data_mini['Topics_DFR'] = data_mini.CleanTranslate.apply(ExtractTopic)

CPU times: user 7.05 s, sys: 256 ms, total: 7.3 s
Wall time: 7.31 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
def ExtractTopic_degree(eachValue):
  try:
    r = Rake(ranking_metric=Metric.WORD_DEGREE, max_length=4) # Uses stopwords for english from NLTK, and all puntuation characters.

    r.extract_keywords_from_text(eachValue)

    return r.get_ranked_phrases()
  except TypeError:
    return []

In [0]:
data_mini['Topics_Deg'] = data_mini.CleanTranslate.apply(ExtractTopic_degree)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
data_mini[['CleanTranslate', 'NounPhrase', 'Topics_DFR', 'Topics_Deg']].sample(5)

Unnamed: 0,CleanTranslate,NounPhrase,Topics_DFR,Topics_Deg
3820,this is regarding the purchase of item on xxxx...,"[xxxx xxxx, xxxx xxxx consumer credit card, xx...","[many sales associates would, third page 3rd p...","[xxxx xxxx credit card, xxxx xxxx charge card,..."
2712,to whom it may concern i ahve received a lette...,"[xxxx xxxx xxxx xxxx, conc ern due process, ko...","[conc ern due process, wells fargo informing, ...","[conc ern due process, wells fargo informing, ..."
2082,today xxxx2019 i tried to call my bank bank o...,"[today xxxx2019, bank bank, new card, deposit ...","[xxxx years old, put xxxx dollars, least 7 tim...","[bank bank, xxxx years old, put xxxx dollars, ..."
7054,i made a purchase with my credit union credit ...,"[credit union credit card, xxxx xxxx xxxx, two...","[two adult ski packages, xxxx approximately on...","[discover card discover stated, xxxx xxxx xxxx..."
6212,i applied for a chase united explorer card in ...,"[explorer card, xxxx xxxx bonus miles, credit ...","[money within 3 months, chase united explorer ...","[since called severa times, chase united explo..."


In [0]:
data_mini.CleanTranslate.iloc[239]

'with midland credit that is a third party company who purchases debits owed that has no contract with me i had a card with xxxx  and there was charges that i hadnt made after reporting this directly to the company i now have xxxx off my credit due to it was resolved i closed my account and some how midland credit purchased this old account and i have no contract with this company to even collect this amount i closed down my account after the fraud purchases where resolved now i have them threaten to garnish wages and they dont have a contract with myseld which happens to xxxx xxxx xxxx i would like for them to stop reporting on my credit as the account is closed due to my request and the charges have being resolved please provide me with a contract that i have with you as you are just a third party who purchased this old account that has being closed'

In [0]:
numba.jit()
def GetNounAdjectivePair(eachValue):
  try:
#     doc = nlp(eachValue)
    noun_adj_pairs = []
    for i,token in enumerate(eachValue):
        if token.pos_ not in ('NOUN','PROPN','VERB'):
            continue
        for j in range(i+1,len(eachValue)):
            if doc[j].pos_ == 'ADJ':
                noun_adj_pairs.append((token.text,doc[j].text))
#                 print(type(token.text))
#                 print(type(doc[j].text))
                break
    return noun_adj_pairs
  except TypeError:
    return []

In [0]:
#nlp = spacy.load("en_core_web_lg")
%time data_mini['NounAdjectivePair'] = data_mini.Consumer_complaint_narrative_spacy_object.apply(GetNounAdjectivePair)

CPU times: user 15min 44s, sys: 6min 1s, total: 21min 46s
Wall time: 11min 6s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
type(data_mini.iloc[0]['Consumer_complaint_narrative_spacy_object'])

spacy.tokens.doc.Doc

In [0]:
data_mini.head(5)

Unnamed: 0,Product,Consumer_complaint_narrative,category_id,Consumer_complaint_narrative_spacy_object,CleanTranslate,NounPhrase,TextSentiment,Topics_DFR,Topics_Deg,NounAdjectivePair
0,"Credit reporting, credit repair services, or o...",I am a victim of identity theft. My personal i...,0,"(I, am, a, victim, of, identity, theft, ., My,...",i am a victim of identity theft my personal in...,"[identity theft, personal information, xxxx xx...",0.1,"[identity 3 section 605b, fair credit reportin...","[identity 3 section 605b, fair credit reportin...","[(am, my), (victim, my), (identity, my), (thef..."
1,"Credit reporting, credit repair services, or o...","Transunion, XXXX, XXXX all show inquiries to b...",0,"(Transunion, ,, XXXX, ,, XXXX, all, show, inqu...",transunion xxxx xxxx all show inquiries to bus...,[transunion xxxx xxxx],0.1,"[transunion xxxx xxxx, never given permission,...","[transunion xxxx xxxx, list hard inquiries, ne...","[(transunion, that), (xxxx, that), (xxxx, that..."
2,"Credit reporting, credit repair services, or o...",The Summer of XX/XX/2018 I was denied a mortga...,0,"(The, Summer, of, XX, /, XX/2018, I, was, deni...",the summer of xxxx2018 i was denied a mortgage...,"[mortgage loan due, xxxx xxxx credit card, acc...",-0.01,"[requested xxxx xxxx still, certified letters ...","[xxxx xxxx credit card, requested xxxx xxxx st...","[(summer, xxxx), (xxxx2018, xxxx), (was, xxxx)..."
3,"Credit reporting, credit repair services, or o...","XXXX, XXXX, Experian all show inquiries to bus...",0,"(XXXX, ,, XXXX, ,, Experian, all, show, inquir...",xxxx xxxx experian all show inquiries to busin...,[xxxx xxxx experian],0.1,"[xxxx xxxx experian, never given permission, l...","[xxxx xxxx experian, list hard inquiries, neve...","[(xxxx, experian), (show, that), (inquiries, t..."
4,"Credit reporting, credit repair services, or o...",There are many mistakes appear in my report wi...,0,"(There, are, many, mistakes, appear, in, my, r...",there are many mistakes appear in my report wi...,[],0.5,"[many mistakes appear, report without]","[many mistakes appear, report without, underst...","[(are, many), (mistakes, my), (appear, my), (r..."


In [0]:
data.to_csv("SurveyTextData_Final.csv")

# Detail Analysis on Topics & Entities 

## Unique Topics - Frequency, Strength & Similarity

###Frequency & Strength

In [0]:
def getUniqueTopicsFrequency(DfColumn_Topics, DfColumn_NP):
  AllTopicList = [item for sublist in list(DfColumn_Topics) for item in sublist]
  AllNPList = [item for sublist in list(DfColumn_NP) for item in sublist]
  UniqueTopicList = list(set(AllTopicList))
  UniqueNPList = list(set(AllNPList))
  CombinedList = AllTopicList + AllNPList
  TopicFrequency = collections.Counter(CombinedList)
  print(TopicFrequency)
  df = pd.DataFrame.from_dict(TopicFrequency, orient='index').reset_index()
#   df['Topic'] = df.index
  df.columns = df.columns.map(str)
  df['Strength%'] = (df['0']/len(CombinedList))*100
  df.rename(columns = {'0':'TotalCountAllAnswer', 'index':'ImportantTopic'}, inplace=True)
#   df['Strength%'] = (df[0]/TotalUniqueTopic)*100   ## Divide by total Answers 
  return df.sort_values(by=['TotalCountAllAnswer'], ascending=False),UniqueTopicList 

In [0]:
import collections
ImportantTopicsDF, TopicList = getUniqueTopicsFrequency(data['Topics_DFR'], data['NounPhrase'] )

Counter({'spicy taste good': 135, 'savory taste good': 114, 'sour taste good': 93, 'sweetness taste good': 79, 'salty taste good': 58, 'spicy taste': 53, 'garlic taste good': 35, 'viscositys good': 34, 'garlic taste': 33, 'salty taste': 28, 'spicy taste strong': 28, 'chilli smells strong': 26, 'sour taste': 24, 'textures soft': 19, 'real chilli': 19, 'garlic smells': 16, 'chilli smells good': 16, 'garlic smell': 12, 'colour need': 12, 'garlic smells good': 11, 'garlic smells strong': 9, 'savory taste': 9, 'little bit': 9, 'garlic taste strong': 8, 'sauce need': 7, 'savoury taste': 7, 'garlic smell strong': 6, 'chilli smell fresh': 6, 'light red colour': 6, 'red colour': 6, 'chilli smells strong good': 6, 'chilli taste strong': 6, 'chilli taste real': 6, 'viscosity need': 6, 'sweetness need': 6, 'little bit more': 6, 'sweetness taste strong': 5, 'chilli sauce': 5, 'light red': 5, 'sour sweet taste good': 5, 'colour needs': 5, 'viscosity isnt good enough': 5, 'spicy tatse good': 4, 'sauc

In [0]:
ImportantTopicsDF.head(10)

Unnamed: 0,ImportantTopic,TotalCountAllAnswer,Strength%
22,spicy taste good,135,8.29748
17,savory taste good,114,7.006761
11,sour taste good,93,5.716042
10,sweetness taste good,79,4.855562
34,salty taste good,58,3.564843
3,spicy taste,53,3.257529
18,garlic taste good,35,2.151199
14,viscositys good,34,2.089736
88,garlic taste,33,2.028273
5,salty taste,28,1.720959


### Noun-Adjective Break ---- Entity & Their Attributes 

In [0]:
def getNoun_Entity(eachValue):
  try:
    doc = nlp(eachValue)
    Noun = []
#     for chunk in doc.noun_chunks:
#       Noun.append(chunk.text)
    for token in doc:
      if token.pos_ == 'NOUN':
        Noun.append(token.text)
    return Noun
  except TypeError:
    return []
    
  

def getAllAdjective(eachValue):
  try:
    doc = nlp(eachValue)
    Noun = []
    for token in doc:
      if token.pos_ == 'ADJ':
        Noun.append(token.text)
    return Noun
  except TypeError:
    return []

In [0]:
nlp = spacy.load("en_core_web_sm")
ImportantTopicsDF['ImportantEntity'] = ImportantTopicsDF['ImportantTopic'].apply(getNoun_Entity)
ImportantTopicsDF['EntityAttributes'] = ImportantTopicsDF['ImportantTopic'].apply(getAllAdjective)
ImportantTopicsDF['TopicSentiment'] = ImportantTopicsDF['ImportantTopic'].apply(GetSentiment)
ImportantTopicsDF['TopicSubjectivity'] = ImportantTopicsDF['ImportantTopic'].apply(GetSentimentSubjectivity)
# ImportantTopicsDF['POSTAG'] = ImportantTopicsDF['ImportantTopic'].apply(getPOSTAG)

### Similarity (Needs Optimization. very Slow)

In [0]:
global TopicList
global ImportantTopicsDF

In [0]:

numba.jit()
def getSimilarity_topic(eachValue):
  SimilarityDict = {}
  SimilarityList = []
  for eachTopic in TopicList:
    try:
      SimilarityValue = float("{0:.2f}".format(nlp(eachValue).similarity(nlp(eachTopic))))
      if SimilarityValue > 0.90 and SimilarityValue < 1:
        StrengthValue = float("{0:.2f}".format(ImportantTopicsDF.loc[ImportantTopicsDF['ImportantTopic'] == eachTopic]['Strength%'].values[0]))
        SentimentValue = float("{0:.2f}".format(TextBlob(eachTopic).sentiment.polarity))
        SimilarityList.append([(eachTopic,SimilarityValue), ('Strength', StrengthValue ), ('sentiment', SentimentValue) ])
    except:
      SimilarityList =[]
  SimilarityDict[eachValue] = SimilarityList
  print (SimilarityDict)
  return SimilarityDict

# def apply_myfunc_to_DF(df): return df.apply((lambda row: getSimilarity_topic(*row)), axis=1)

In [0]:
import numpy as np
# from multiprocessing import cpu_count, Parallel
# from joblib import Parallel, delayed
from multiprocessing import Pool
import scipy.sparse as sp
import multiprocessing
import swifter

num_partitions = 10
num_cores = multiprocessing.cpu_count()

 
cores = cpu_count() #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want

@numba.jit()
def parallelize(data, func):
#     a,b,c,d,e,f,g,h,i,j = np.array_split(data, num_partitions)
    a = np.array_split(data, num_partitions)
    pool = Pool(num_cores)
#     df = pd.concat(pool.map(func, [a,b,c,d,e,f,g,h,i,j ]))
    df = sp.vstack(pool.map(func, a), format='csr')
    pool.close()
    pool.join()
    return df
  
numba.jit()
def ParallelizeDFOperation(ImportantTopicsDF):
  ImportantTopicsDF['TopicSimilarity'] = ImportantTopicsDF['ImportantTopic'].swifter.apply(getSimilarity_topic)
  return ImportantTopicsDF
  
data = parallelize(ImportantTopicsDF, ParallelizeDFOperation)

{'real chilli taste strong': [[('real chilli taste good', 0.97), ('Strength', 0.4), ('sentiment', 0.45)]]}
{'spicy taste good': [[('spicy taste strong', 0.95), ('Strength', 2.0), ('sentiment', 0.43)], [('spicy sauce taste good', 0.93), ('Strength', 0.4), ('sentiment', 0.7)], [('viscosity taste good', 0.91), ('Strength', 0.2), ('sentiment', 0.7)], [('savory taste good', 0.93), ('Strength', 8.42), ('sentiment', 0.7)], [('spicy sauce taste strong', 0.91), ('Strength', 0.4), ('sentiment', 0.43)], [('salty taste good', 0.91), ('Strength', 4.81), ('sentiment', 0.7)], [('garlic taste good', 0.92), ('Strength', 2.0), ('sentiment', 0.7)], [('spicy taste delicious', 0.94), ('Strength', 0.4), ('sentiment', 1.0)], [('spicy taste good fresh', 0.93), ('Strength', 0.4), ('sentiment', 0.5)], [('spicy taste strong good', 0.93), ('Strength', 0.4), ('sentiment', 0.57)], [('swetness taste good', 0.91), ('Strength', 0.4), ('sentiment', 0.7)]]}
{'real chilli taste strong': [[('real chilli taste good', 0.97)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=16, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=15, style=ProgressStyle(description_width=…

{'spicy taste good': [[('spicy taste strong', 0.95), ('Strength', 2.0), ('sentiment', 0.43)], [('spicy sauce taste good', 0.93), ('Strength', 0.4), ('sentiment', 0.7)], [('viscosity taste good', 0.91), ('Strength', 0.2), ('sentiment', 0.7)], [('savory taste good', 0.93), ('Strength', 8.42), ('sentiment', 0.7)], [('spicy sauce taste strong', 0.91), ('Strength', 0.4), ('sentiment', 0.43)], [('salty taste good', 0.91), ('Strength', 4.81), ('sentiment', 0.7)], [('garlic taste good', 0.92), ('Strength', 2.0), ('sentiment', 0.7)], [('spicy taste delicious', 0.94), ('Strength', 0.4), ('sentiment', 1.0)], [('spicy taste good fresh', 0.93), ('Strength', 0.4), ('sentiment', 0.5)], [('spicy taste strong good', 0.93), ('Strength', 0.4), ('sentiment', 0.57)], [('swetness taste good', 0.91), ('Strength', 0.4), ('sentiment', 0.7)]]}
{'real chilli taste strong': [[('real chilli taste good', 0.97), ('Strength', 0.4), ('sentiment', 0.45)]]}
{'sour taste good': [[('garlic taste good', 0.92), ('Strength',

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=15, style=ProgressStyle(description_width=…

{'sauce taste strong': [[('spicy sauce taste strong', 0.92), ('Strength', 0.4), ('sentiment', 0.43)]]}
{'really good': []}
{'sauce taste strong': [[('spicy sauce taste strong', 0.92), ('Strength', 0.4), ('sentiment', 0.43)]]}


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=15, style=ProgressStyle(description_width=…

{'swetness taste good': [[('sweetness taste good soft', 0.93), ('Strength', 0.4), ('sentiment', 0.4)], [('viscosity taste good', 0.95), ('Strength', 0.2), ('sentiment', 0.7)], [('savory taste good', 0.94), ('Strength', 8.42), ('sentiment', 0.7)], [('savory taste good overall', 0.91), ('Strength', 0.4), ('sentiment', 0.35)], [('spicy taste good', 0.91), ('Strength', 12.22), ('sentiment', 0.7)], [('sweetness taste good', 0.95), ('Strength', 7.01), ('sentiment', 0.7)], [('garlic taste good', 0.93), ('Strength', 2.0), ('sentiment', 0.7)], [('sweettness taste good', 0.93), ('Strength', 0.2), ('sentiment', 0.7)], [('saty taste good', 0.92), ('Strength', 0.4), ('sentiment', 0.7)], [('savoryy taste good', 0.92), ('Strength', 0.4), ('sentiment', 0.7)]]}
{'sauce taste strong': [[('spicy sauce taste strong', 0.92), ('Strength', 0.4), ('sentiment', 0.43)]]}
{'garlic taste good delicious': [[('savory taste good soft', 0.94), ('Strength', 0.2), ('sentiment', 0.4)], [('garlic taste strong savory', 0.

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=15, style=ProgressStyle(description_width=…

{'appearances colours bright': [[('sauces colours bright', 0.94), ('Strength', 0.2), ('sentiment', 0.7)]]}
{'real chilli taste good': [[('real chilli taste strong', 0.97), ('Strength', 0.4), ('sentiment', 0.32)]]}
{'appearances colours bright': [[('sauces colours bright', 0.94), ('Strength', 0.2), ('sentiment', 0.7)]]}


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=15, style=ProgressStyle(description_width=…

{'chilli smell strong': [[('chilli smell strong good', 0.91), ('Strength', 0.4), ('sentiment', 0.57)]]}
{'appearances colours bright': [[('sauces colours bright', 0.94), ('Strength', 0.2), ('sentiment', 0.7)]]}
{'good viscosity taste good': []}
{'realthe taste': []}
{'good sweet savory sour salty taste good': []}
{'good red colour': []}
{'fresh sweetness taste good': []}
{'theh light red appearance colours attractive': []}
{'good garlic taste goodthe chilli': []}
{'spicy taste strong garlic taste': []}
{'soft light appearance colours good': []}
{'strong light red colour': []}
{'good savory taste good': []}
{'attractive sour salty sweetness taste good': []}
{'garlic taste savory': [[('garlic taste strong savory', 0.92), ('Strength', 0.2), ('sentiment', 0.43)], [('garlic taste', 0.92), ('Strength', 0.4), ('sentiment', 0.0)]]}
{'colour isnt pale': []}
{'sauces colours light red': []}
{'lumpy good': []}

{'sweetness savory': [[('savory sweetness', 0.95), ('Strength', 0.4), ('sentiment', 0.

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=15, style=ProgressStyle(description_width=…

{'goodthe viscositys good': []}
{'salty sweet': [[('salty sour sweet', 0.94), ('Strength', 0.2), ('sentiment', 0.1)]]}
{'viscosityys good': []}
{'sauce looks bright': [[('sauce looks modern', 0.93), ('Strength', 0.2), ('sentiment', 0.2)]]}
{'sauces colours bright': [[('appearances colours bright', 0.94), ('Strength', 0.2), ('sentiment', 0.7)]]}
{'sauce isnt watery good': []}
{'salty sour sweetness': []}
{'tastes unique': []}
{'appearance colours light red': [[('light appearance colours good', 0.91), ('Strength', 0.2), ('sentiment', 0.55)]]}
{'looks good': []}
{'good enough doesnt make': []}
{'nauseous garlic smell': []}

{'spicy savory taste good soft': []}
{'spicy savory taste good soft': []}
{'spicy savory taste good soft': []}


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=15, style=ProgressStyle(description_width=…

{'spicy savory taste good soft': []}
{'sauces colours bright tthe chilli': []}
{'delicious salty taste good': []}
{'good light red colour': []}
{'delicious spicy taste good': []}
{'garlic seasoning taste good tthe chill': []}
{'delicious salty sour sweetness': []}
{'chilli sauce': []}
{'light red appearance colours attractive': []}
{'good salty taste good': []}
{'fresh sweettness taste good': []}
{'light red appearance colours good': []}
{'spicy sour taste good': []}
{'strong savory taste good': []}
{'good garlic taste strong savory': []}

{'viscosity taste good': [[('savory taste good', 0.94), ('Strength', 8.42), ('sentiment', 0.7)], [('savory taste good overall', 0.91), ('Strength', 0.4), ('sentiment', 0.35)], [('spicy taste good', 0.91), ('Strength', 12.22), ('sentiment', 0.7)], [('garlic taste good', 0.92), ('Strength', 2.0), ('sentiment', 0.7)], [('viscosity taste good soft', 0.94), ('Strength', 0.4), ('sentiment', 0.4)], [('sweettness taste good', 0.95), ('Strength', 0.2), ('sent

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=15, style=ProgressStyle(description_width=…

{'still strong': []}
{'sauces attractive': []}
{'garlic taste goodthe chilli': []}
{'colours nice': []}
{'sauces bright make': []}
{'sauce smells good good': [[('sauce smells good', 0.93), ('Strength', 0.2), ('sentiment', 0.7)]]}
{'chilli smells fresh': [[('chilli smells natural', 0.92), ('Strength', 0.2), ('sentiment', 0.1)]]}
{'garlic taste strong savory': [[('savory taste strong', 0.91), ('Strength', 0.4), ('sentiment', 0.43)], [('savory taste good soft', 0.93), ('Strength', 0.2), ('sentiment', 0.4)], [('garlic taste savory', 0.92), ('Strength', 0.4), ('sentiment', 0.0)], [('garlic taste good delicious', 0.91), ('Strength', 0.4), ('sentiment', 0.85)], [('garlic taste strong', 0.95), ('Strength', 0.8), ('sentiment', 0.43)], [('garlic taste good', 0.91), ('Strength', 2.0), ('sentiment', 0.7)], [('garlic smell strong', 0.91), ('Strength', 1.2), ('sentiment', 0.43)]]}
{'salty sour sweet': [[('salty sweet', 0.94), ('Strength', 0.2), ('sentiment', 0.35)], [('sour salty', 0.91), ('Strength

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=15, style=ProgressStyle(description_width=…

{'bit strong': []}
{'sweettness taste good': [[('viscosity taste good', 0.95), ('Strength', 0.2), ('sentiment', 0.7)], [('savory taste good', 0.93), ('Strength', 8.42), ('sentiment', 0.7)], [('garlic taste good', 0.91), ('Strength', 2.0), ('sentiment', 0.7)], [('viscosity taste good soft', 0.91), ('Strength', 0.4), ('sentiment', 0.4)], [('saty taste good', 0.92), ('Strength', 0.4), ('sentiment', 0.7)], [('swetness taste good', 0.93), ('Strength', 0.4), ('sentiment', 0.7)]]}
{'sauce looks modern': [[('sauce looks bright', 0.93), ('Strength', 0.2), ('sentiment', 0.7)]]}
{'viscositys thick': []}
{'textures softt': []}
{'chilli smells good strong': [[('chilli smells strong good', 0.96), ('Strength', 1.0), ('sentiment', 0.57)]]}
{'colours thick': []}
{'savoury taste us enough': []}
{'sauce smells good': [[('sauce smells good good', 0.93), ('Strength', 0.2), ('sentiment', 0.7)]]}
{'savory taste good soft': [[('savory taste strong', 0.91), ('Strength', 0.4), ('sentiment', 0.43)], [('sweetness

ValueError: ignored

In [0]:
import pickle
TopicListFN = "TopicListDump.dat"
with open(TopicListFN, "wb") as f:
    pickle.dump(TopicList, f)

In [0]:
data.head(5)

### Similarity of High Strength Topic - Given a Topic ; Return all Similar Topics or  Threshold for Highest Strength Topics 

####Threshold Logic

#####Pickel Dump of TopicList

In [0]:
import pickle
filehandler = open("TopicList.dat","wb")
pickle.dump(TopicList,filehandler)

#####Logic

In [0]:
global TopicList
global ImportantTopicsDF


def getSimilarity_topic(eachValue):
  SimilarityDict = {}
  SimilarityList = []
  SortedNumberValue = sorted( [(x,i) for (i,x) in enumerate(ImportantTopicsDF['Strength%'])], reverse=True )[:SortNumber][SortNumber-1][0]
  if ImportantTopicsDF['Strength%'][(ImportantTopicsDF['ImportantTopic'] == eachValue)].values[0] >= SortedNumberValue:
#     print ("Inside Important Topic", eachValue)
    for eachTopic in TopicList:
      SimilarityValue = float("{0:.2f}".format(nlp(eachValue).similarity(nlp(eachTopic))))
      if SimilarityValue > 0.90 and SimilarityValue < 1:
        StrengthValue = float("{0:.2f}".format(ImportantTopicsDF.loc[ImportantTopicsDF['ImportantTopic'] == eachTopic]['Strength%'].values[0]))
        SentimentValue = float("{0:.2f}".format(TextBlob(eachTopic).sentiment.polarity))
        SimilarityList.append([(eachTopic,SimilarityValue), ('Strength', StrengthValue ), ('sentiment', SentimentValue) ])
#         print (SimilarityList)
  SimilarityDict[eachValue] = SimilarityList
#   print (SimilarityDict)
  return SimilarityDict

import numpy as np
from multiprocessing import Pool
import scipy.sparse as sp
import multiprocessing
import swifter

num_partitions = 5
num_cores = multiprocessing.cpu_count()

 
def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df
  
def ParallelizeDFOperation(ImportantTopicsDF):
  global SortNumber
  SortNumber = 10
  ImportantTopicsDF['TopicSimilarity'] = ImportantTopicsDF['ImportantTopic'].apply(getSimilarity_topic)
  return ImportantTopicsDF
  
ImportantTopicsDF = parallelize_dataframe(ImportantTopicsDF, ParallelizeDFOperation)


In [0]:
ImportantTopicsDF.to_csv("Survey_Similarity.csv")

####SingleTopic Logic  - Not Complete

# Production Service  -> Important Topic Frequency, Strength, Sentiment & Entities -> Working

## Server

In [0]:
import pandas as pd
df = pd.read_csv("AmazonReviewBig.csv")

In [0]:
df.head()

Unnamed: 0,0,Translate
0,1,Very nice Items. Fits my needs and very fast s...
1,2,very good phone
2,3,thank you good
3,4,Very good phones worth what I paid
4,5,I owned a galaxy before this and I miss featur...


In [0]:
%%writefile server.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import socket
print(socket.gethostbyname(socket.getfqdn(socket.gethostname())))

import collections
from flask import Flask
import pandas as pd
from rake_nltk import Metric, Rake
import spacy
from textblob import TextBlob
from flask import Flask, request, render_template
import pandas as pd
import string
import json
from textblob.np_extractors import ConllExtractor
from flask_jsonpify import jsonpify

app = Flask(__name__)

def readData(path):
    # Read NPZ and Transform to Datafram. Single Question Responses. Sample is on 51 Records
    return pd.read_csv(path)


def GetSentiment(eachValue):
  try:
    return float("{0:.2f}".format(TextBlob(eachValue).sentiment.polarity))
  except TypeError:
    return []

def TextCleaning(eachValue):
  try:
    return eachValue.translate(str.maketrans('', '', string.punctuation)).lower()
  except AttributeError:
    return "Dummy"
 

def GetSentimentSubjectivity(eachValue):
  try:
    return float("{0:.2f}".format(TextBlob(eachValue).sentiment.subjectivity))
  except TypeError:
    return []

def GetNounPhrase(eachValue):
  try:
    return TextBlob(eachValue, np_extractor=extractor).noun_phrases
  except TypeError:
    return []

def ExtractTopic(eachValue):
  try:
    r = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO, min_length=2, max_length=4) # Uses stopwords for english from NLTK, and all puntuation characters.

    r.extract_keywords_from_text(eachValue)

    return r.get_ranked_phrases()
  except TypeError:
    return []


def GetNounAdjectivePair(eachValue):
    try:
        doc = nlp(eachValue)
        noun_adj_pairs = []
        for i,token in enumerate(doc):
            if token.pos_ not in ('NOUN','PROPN','VERB'):
                continue
            for j in range(i+1,len(doc)):
                if doc[j].pos_ == 'ADJ':
                    noun_adj_pairs.append((token.text,doc[j].text))
                    break
        return noun_adj_pairs
    except TypeError:
        return []


def getSentTopicNP(dataframe):
    dataframe['CleanTranslate'] = dataframe.Translate.apply(TextCleaning)
    dataframe['NounPhrase'] = dataframe.CleanTranslate.apply(GetNounPhrase)
    dataframe['TextSentiment']  = dataframe.CleanTranslate.apply(GetSentiment)
    dataframe['Topics_DFR'] = dataframe.CleanTranslate.apply(ExtractTopic)
    dataframe['NounAdjectivePair'] = dataframe.CleanTranslate.apply(GetNounAdjectivePair)
    return dataframe
  
  
def getFinalTopicDf(ImportantTopicsDF):
  ImportantTopicsDF['ImportantEntity'] = ImportantTopicsDF['ImportantTopic'].apply(getNoun_Entity)
  ImportantTopicsDF['ImportantVerbEntity'] = ImportantTopicsDF['ImportantTopic'].apply(getVerb_Entity)
  ImportantTopicsDF['EntityAttributes'] = ImportantTopicsDF['ImportantTopic'].apply(getAllAdjective)
  ImportantTopicsDF['TopicSentiment'] = ImportantTopicsDF['ImportantTopic'].apply(GetSentiment)
  ImportantTopicsDF['TopicSubjectivity'] = ImportantTopicsDF['ImportantTopic'].apply(GetSentimentSubjectivity)
#   ImportantTopicsDF['TextSentiment']  = ImportantTopicsDF.Translate.apply(GetSentiment)
  return ImportantTopicsDF
  
def getUniqueTopicsFrequency(DfColumn_Topics, DfColumn_NP):
  AllTopicList = [item for sublist in list(DfColumn_Topics) for item in sublist]
  AllNPList = [item for sublist in list(DfColumn_NP) for item in sublist]
  UniqueTopicList = list(set(AllTopicList))
  UniqueNPList = list(set(AllNPList))
  CombinedList = AllTopicList + AllNPList
  TopicFrequency = collections.Counter(CombinedList)
#   print(TopicFrequency)
  df = pd.DataFrame.from_dict(TopicFrequency, orient='index').reset_index()
#   df['Topic'] = df.index
  df.columns = df.columns.map(str)
  df['Strength%'] = (df['0']/len(CombinedList))*100
  df.rename(columns = {'0':'TotalCountAllAnswer', 'index':'ImportantTopic'}, inplace=True)
#   df['Strength%'] = (df[0]/TotalUniqueTopic)*100   ## Divide by total Answers 
  return df.sort_values(by=['TotalCountAllAnswer'], ascending=False),UniqueTopicList 

def getVerb_Entity(eachValue):
  try:
    doc = nlp(eachValue)
    Noun = []
#     for chunk in doc.noun_chunks:
#       Noun.append(chunk.text)
    for token in doc:
      if token.pos_ == 'VERB':
        Noun.append(token.text)
    return Noun
  except TypeError:
    return []
    
def getNoun_Entity(eachValue):
  try:
    doc = nlp(eachValue)
    Noun = []
#     for chunk in doc.noun_chunks:
#       Noun.append(chunk.text)
    for token in doc:
      if token.pos_ == 'NOUN':
        Noun.append(token.text)
    return Noun
  except TypeError:
    return []
  

def getAllAdjective(eachValue):
  try:
    doc = nlp(eachValue)
    Noun = []
    for token in doc:
      if token.pos_ == 'ADJ':
        Noun.append(token.text)
    return Noun
  except TypeError:
    return []


def loadSpacyModel(ModelName):
    return spacy.load(ModelName)

def Dataframe2JsonObject(dataframe):
#     df_list = dataframe.values.tolist()
#     JSONP_data = jsonpify(df_list)
#     return JSONP_data
    return dataframe.to_json(orient='records')

@app.route('/')
def SurveyAnalysis():
    global extractor
    extractor = ConllExtractor()
    ModelName = 'en_core_web_lg'
    global nlp
    nlp = loadSpacyModel(ModelName)
#     data = readData('SurveyTextData_subset.csv')
    data = readData('SurveyTextData_subset.csv')
    getSentTopicNPDF = getSentTopicNP(data)
    ImportantTopicsDF, TopicList = getUniqueTopicsFrequency(data['Topics_DFR'], data['NounPhrase'] )
    
    
    return Dataframe2JsonObject(getFinalTopicDf(ImportantTopicsDF))

if __name__ == '__main__':
    # nlp = spacy.load("en_core_web_sm")
    app.run(threaded=True)

# import threading
# threading.Thread(target=app.run, kwargs={'host':'0.0.0.0','port':80}).start() 

Overwriting server.py


##MultiProcess Server Run

In [3]:
import os
import subprocess
import sys
import time
import socket
# Start a subprocess that runs the Flask server
p = subprocess.Popen([sys.executable, "-m", "flask", "run"], env=dict(**os.environ, FLASK_APP="server.py"), stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Start two subthreads that forward the output from the Flask server to the output of the Jupyter notebook
def forward(i, o):
    while p.poll() is None:
        l = i.readline().decode('utf-8')
        if l:
            o.write("[SERVER] " + l)

import threading
threading.Thread(target=forward, args=(p.stdout, sys.stdout)).start()
threading.Thread(target=forward, args=(p.stderr, sys.stderr)).start()
import socket
print(socket.gethostbyname(socket.getfqdn(socket.gethostname())))
# sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
# Let's give the server a bit of time to make sure it has started
time.sleep(2)

172.28.0.2
[SERVER]  * Serving Flask app "server.py"
[SERVER]  * Environment: production
[SERVER]    Use a production WSGI server instead.
[SERVER]  * Debug mode: off


[SERVER] Usage: python -m flask run [OPTIONS]
[SERVER] 
[SERVER] Error: Could not import "server".


##Request

In [0]:
import requests
# r = requests.get("http://172.28.0.2/")
r = requests.get(" http://127.0.0.1:5000/")
print(r.status_code)
print(r.encoding)
print(r.apparent_encoding)
print(r.text)

200
utf-8
ascii
[{"ImportantTopic":"spicy taste good","TotalCountAllAnswer":61,"Strength%":12.2244488978,"ImportantEntity":["taste"],"ImportantVerbEntity":[],"EntityAttributes":["spicy","good"],"TopicSentiment":0.7,"TopicSubjectivity":0.6},{"ImportantTopic":"sour taste good","TotalCountAllAnswer":43,"Strength%":8.6172344689,"ImportantEntity":["taste"],"ImportantVerbEntity":[],"EntityAttributes":["sour","good"],"TopicSentiment":0.27,"TopicSubjectivity":0.35},{"ImportantTopic":"savory taste good","TotalCountAllAnswer":42,"Strength%":8.4168336673,"ImportantEntity":["taste"],"ImportantVerbEntity":[],"EntityAttributes":["savory","good"],"TopicSentiment":0.7,"TopicSubjectivity":0.6},{"ImportantTopic":"sweetness taste good","TotalCountAllAnswer":35,"Strength%":7.0140280561,"ImportantEntity":["sweetness"],"ImportantVerbEntity":["taste"],"EntityAttributes":["good"],"TopicSentiment":0.7,"TopicSubjectivity":0.6},{"ImportantTopic":"salty taste good","TotalCountAllAnswer":24,"Strength%":4.809619238

[SERVER] 127.0.0.1 - - [25/Apr/2019 10:53:55] "[37mGET / HTTP/1.1[0m" 200 -


## Request 2 Dataframe

In [0]:
import pandas as pd
pd.DataFrame.from_dict(eval(r.text), orient='columns').head()

Unnamed: 0,EntityAttributes,ImportantEntity,ImportantTopic,ImportantVerbEntity,Strength%,TopicSentiment,TopicSubjectivity,TotalCountAllAnswer
0,"[spicy, good]",[taste],spicy taste good,[],12.224449,0.7,0.6,61
1,"[sour, good]",[taste],sour taste good,[],8.617234,0.27,0.35,43
2,"[savory, good]",[taste],savory taste good,[],8.416834,0.7,0.6,42
3,[good],[sweetness],sweetness taste good,[taste],7.014028,0.7,0.6,35
4,"[salty, good]",[],salty taste good,[taste],4.809619,0.7,0.6,24


In [2]:
!ps -fA | grep python

root          31      11  2 18:29 ?        00:00:01 /usr/bin/python2 /usr/local/bin/jupyter-notebook --ip="172.28.0.2" --port=9000 --FileContentsManager.root_dir="/" --MappingKernelManager.root_dir="/content"
root         144      31 11 18:29 ?        00:00:01 /usr/bin/python3 -m ipykernel_launcher -f /root/.local/share/jupyter/runtime/kernel-1d33ff92-aadf-4b2c-92c8-3a2b555c2a3b.json
root         166     144  0 18:30 ?        00:00:00 /bin/bash -c ps -fA | grep python


In [1]:
!kill 2251

/bin/bash: line 0: kill: (2251) - No such process


In [0]:
import pandas as pd
df = pd.read_csv("AmazonReviewBig_Test.csv")

In [0]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc1 = nlp("good")
doc2 =nlp("nice")
doc1.similarity(doc2)

0.8133191041632585

In [0]:
from gensim.models import Word2Vec



In [0]:
sentences = df['ImportantTopic'].tolist()

In [0]:
import gensim
model = gensim.models.Word2Vec([sentences],min_count=1,size=32)

In [0]:
print (model.most_similar(positive=['great condition'], negative=[], topn=2))

[('straight talk matt', 0.6311078071594238), ('simple task im', 0.5678778886795044)]


  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


In [0]:
X = model[model.wv.vocab]

  """Entry point for launching an IPython kernel.


In [0]:
from sklearn import cluster
from sklearn import metrics
kmeans = cluster.KMeans(n_clusters=5)
kmeans.fit(X)
 
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
 
# print ("Cluster id labels for inputted data")
# print (labels)
# print ("Centroids data")
# print (centroids)
 
print ("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print (kmeans.score(X))
 
silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
 
print ("Silhouette_score: ")
print (silhouette_score)


Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):
-3.3837767
Silhouette_score: 
0.022312539


In [0]:
words = list(model.wv.vocab)
for i, word in enumerate(words):  
    print (word + ":" + str(labels[i]))

new phone:4
sim card:2
great phone:0
good condition:4
great condition:4
perfect condition:0
home button:1
apple store:4
great product:3
works great:2
good phone:4
battery life:1
iphone 4:3
iphone 4s:1
excellent condition:4
great price:1
new iphone:0
excellent product:0
brand new:4
verizon phone:1
nice phone:0
phone came:3
great shape:3
good product:1
great deal:0
cell phone:1
new one:3
first iphone:2
serial number:4
like new:0
good price:4
straight talk:2
works perfectly:4
good deal:1
brand new phone:2
old phone:1
phone works great:0
used phone:3
full charge:3
would recommend:1
good shape:4
new battery:2
long time:4
cant use:0
excellent phone:4
original box:1
unlocked phone:0
money back:0
verizon store:4
works fine:0
dont know:1
sprint phone:3
full refund:1
great purchase:1
original packaging:2
first time:1
refurbished phone:4
quick delivery:1
replacement phone:4
iphone 5:0
worked great:1
works well:4
phone would:1
big deal:0
phone arrived:1
tmobile sim card:0
water damage:0
works good

In [0]:
# df = pd.DataFrame.from_dict(eval(r.text), orient='columns')
df['ImportantEntity'] = df.ImportantEntity.astype(str)
df['EntityAttributes'] = df.EntityAttributes.astype(str)
df = df[df['EntityAttributes'] != "[]"]
# type(df['ImportantEntity'][0])
df[df['ImportantEntity'].str.contains("condition")]
# df[df['ImportantTopic'].str.contains("shipping")]
# df[df['ImportantVerbEntity'].str.contains("definitely")]

Unnamed: 0.1,Unnamed: 0,EntityAttributes,ImportantEntity,ImportantTopic,ImportantVerbEntity,Strength%,TopicSentiment,TopicSubjectivity,TotalCountAllAnswer
3,3,['good'],['condition'],good condition,[],0.633187,0.7,0.6,392
4,4,['great'],['condition'],great condition,[],0.486197,0.8,0.75,301
5,5,['perfect'],['condition'],perfect condition,[],0.458738,1.0,1.0,284
14,14,['excellent'],['condition'],excellent condition,[],0.285904,1.0,1.0,177
69,69,['new'],['condition'],new condition,[],0.067842,0.14,0.45,42
70,70,['pristine'],['condition'],pristine condition,[],0.066226,0.0,0.0,41
239,239,['amazing'],['condition'],amazing condition,[],0.029075,0.6,0.9,18
280,280,['good'],['conditions'],good conditions,[],0.025844,0.7,0.6,16
323,323,['excellant'],['condition'],excellant condition,[],0.022614,0.0,0.0,14
341,341,['awesome'],['condition'],awesome condition,[],0.022614,1.0,1.0,14


In [0]:
pd.DataFrame.from_dict(eval(r.text), orient='columns').to_csv("AmazonReviewBig_Test.csv",encoding = 'utf-8')

UnicodeEncodeError: ignored

## Kill Port, If Hangs <Flask Port>

In [0]:
!ps -fA | grep python

root          34      24  0 06:16 ?        00:00:13 /usr/bin/python2 /usr/local/bin/jupyter-notebook --ip="172.28.0.2" --port=9000 --FileContentsManager.root_dir="/" --MappingKernelManager.root_dir="/content"
root         132      34  0 06:17 ?        00:02:04 /usr/bin/python3 -m ipykernel_launcher -f /root/.local/share/jupyter/runtime/kernel-e9e029f6-8ac6-48c4-8441-0b67434c274f.json
root        1896      34  0 10:38 ?        00:00:02 /usr/bin/python3 -m ipykernel_launcher -f /root/.local/share/jupyter/runtime/kernel-c9555fae-256a-4ece-86f0-8b964fcd5530.json
root        2060    1896  6 10:41 ?        00:00:32 /usr/bin/python3 -m flask run
root        2065    2060  0 10:41 ?        00:00:00 /usr/bin/python3 -Wignore:::pip._internal.cli.base_command -c from multiprocessing.semaphore_tracker import main;main(3)
root        2208    1896  0 10:49 ?        00:00:00 /bin/bash -c ps -fA | grep python


In [0]:
!kill 2060

# Production Service -> Threshold Topic Similarity -> Gets Top N Topics Similairty -> Working

## Server

In [0]:
%%writefile server_similarity.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import socket
print(socket.gethostbyname(socket.getfqdn(socket.gethostname())))
from multiprocessing import Pool
import multiprocessing
import scipy.sparse as sp
import collections
from flask import Flask
import pandas as pd
from rake_nltk import Metric, Rake
import spacy
from textblob import TextBlob
from flask import Flask, request, render_template
import pandas as pd
import string
import json
from textblob.np_extractors import ConllExtractor
from flask_jsonpify import jsonpify
from multiprocessing import Pool
import scipy.sparse as sp
import numpy as np
import pickle
import swifter


app = Flask(__name__)


def getSimilarity_topic(eachValue):
  SimilarityDict = {}
  SimilarityList = []
  SortedNumberValue = sorted( [(x,i) for (i,x) in enumerate(ImportantTopicsDF['Strength%'])], reverse=True )[:SortNumber][SortNumber-1][0]
  if ImportantTopicsDF['Strength%'][(ImportantTopicsDF['ImportantTopic'] == eachValue)].values[0] >= SortedNumberValue:
#     print ("Inside Important Topic", eachValue)
    for eachTopic in TopicList:
      SimilarityValue = float("{0:.2f}".format(nlp(eachValue).similarity(nlp(eachTopic))))
      if SimilarityValue > 0.90 and SimilarityValue < 1:
        StrengthValue = float("{0:.2f}".format(ImportantTopicsDF.loc[ImportantTopicsDF['ImportantTopic'] == eachTopic]['Strength%'].values[0]))
        SentimentValue = float("{0:.2f}".format(TextBlob(eachTopic).sentiment.polarity))
        SimilarityList.append([(eachTopic,SimilarityValue), ('Strength', StrengthValue ), ('sentiment', SentimentValue) ])
#         print (SimilarityList)
  SimilarityDict[eachValue] = SimilarityList
#   print (SimilarityDict)
  return SimilarityDict

import numpy as np
from multiprocessing import Pool
import scipy.sparse as sp
import multiprocessing
import swifter

num_partitions = 4
num_cores = multiprocessing.cpu_count()

 
def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df
  
def ParallelizeDFOperation(ImportantTopicsDF):
  global SortNumber
  SortNumber = 10
  ImportantTopicsDF['TopicSimilarity'] = ImportantTopicsDF['ImportantTopic'].apply(getSimilarity_topic)
  return ImportantTopicsDF
  

def callParallelDFOperationFunction():
  return ParallelizeDFOperation(ImportantTopicsDF)

def getTopicList(PicklePath):
  with open(PicklePath, "rb") as f:
    return pickle.load(f)
  
def readData(path):
    # Read NPZ and Transform to Datafram. Single Question Responses. Sample is on 51 Records
    return pd.read_csv(path)
  
def loadSpacyModel(ModelName):
    return spacy.load(ModelName)

def Dataframe2JsonObject(dataframe):
#     df_list = dataframe.values.tolist()
#     JSONP_data = jsonpify(df_list)
#     return JSONP_data
    return dataframe.to_json(orient='records')

def getTopicListfromDF(Dataframe):
  return Dataframe['ImportantTopic'].tolist()

@app.route('/')
def SurveyClusterNTopic():
    global ImportantTopicsDF
    ImportantTopicsDF = readData('AmazonReviewOutput.csv')
    global TopicList
    TopicList = getTopicListfromDF(ImportantTopicsDF)
    
    
    global nlp
    ModelName = 'en_core_web_md'
    nlp = loadSpacyModel(ModelName)
    
    global num_partitions
    num_partitions = 4
    global num_cores
    
    global SortNumber
    SortNumber = 5
    global ParallelizeDFOperation
    num_cores = multiprocessing.cpu_count()
#     data = parallelize(ImportantTopicsDF, ParallelizeDFOperation)
    ImportantTopicsDF = parallelize_dataframe(ImportantTopicsDF, ParallelizeDFOperation)
    return Dataframe2JsonObject(ImportantTopicsDF)
if __name__ == '__main__':
    # nlp = spacy.load("en_core_web_sm")
    app.run(threaded=True)
    
    

Overwriting server_similarity.py


## Server Run

In [0]:
import os
import subprocess
import sys
import time
import socket
# Start a subprocess that runs the Flask server
p = subprocess.Popen([sys.executable, "-m", "flask", "run"], env=dict(**os.environ, FLASK_APP="server_similarity.py"), stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Start two subthreads that forward the output from the Flask server to the output of the Jupyter notebook
def forward(i, o):
    while p.poll() is None:
        l = i.readline().decode('utf-8')
        if l:
            o.write("[SERVER] " + l)

import threading
threading.Thread(target=forward, args=(p.stdout, sys.stdout)).start()
threading.Thread(target=forward, args=(p.stderr, sys.stderr)).start()
import socket
print(socket.gethostbyname(socket.getfqdn(socket.gethostname())))
# sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
# Let's give the server a bit of time to make sure it has started
time.sleep(2)

172.28.0.2
[SERVER]  * Serving Flask app "server_similarity.py"
[SERVER]  * Environment: production
[SERVER]    Use a production WSGI server instead.
[SERVER]  * Debug mode: off


[SERVER]  * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


## Request

In [0]:
import requests
# r = requests.get("http://172.28.0.2/")
r = requests.get(" http://127.0.0.1:5000/")
print(r.status_code)
print(r.encoding)
print(r.apparent_encoding)
print(r.text)

[SERVER] 172.28.0.2
200
utf-8
ascii
[{"Unnamed: 0":0,"EntityAttributes":"['great']","ImportantEntity":"['tablet']","ImportantTopic":"great tablet","Strength%":1.7616972912,"TopicSentiment":0.8,"TopicSubjectivity":0.75,"TotalCountAllAnswer":93,"TopicSimilarity":{"great tablet":[[["good tablet",0.96],["Strength",0.64],["sentiment",0.7]],[["great little tablet",0.93],["Strength",0.19],["sentiment",0.31]],[["perfect tablet",0.92],["Strength",0.15],["sentiment",1.0]],[["excellent tablet",0.93],["Strength",0.11],["sentiment",1.0]],[["nice tablet",0.92],["Strength",0.08],["sentiment",0.6]],[["better tablet",0.91],["Strength",0.08],["sentiment",0.5]],[["fantastic tablet",0.95],["Strength",0.06],["sentiment",0.4]],[["best tablet",0.92],["Strength",0.04],["sentiment",1.0]],[["terrific tablet",0.93],["Strength",0.04],["sentiment",0.0]],[["great tablet use",0.94],["Strength",0.04],["sentiment",0.8]],[["great inexpensive tablet",0.92],["Strength",0.04],["sentiment",0.8]],[["tablet work great",0.94]

[SERVER] 127.0.0.1 - - [22/Apr/2019 10:24:24] "[37mGET / HTTP/1.1[0m" 200 -


In [0]:
pd.DataFrame.from_dict(eval(r.text), orient='columns').to_csv("AmazonReviewTopicSimilairty.csv")

#Production Service -> Topic Similairty  -> Not Working

##Server

In [0]:
%%writefile server_similarity.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import socket
print(socket.gethostbyname(socket.getfqdn(socket.gethostname())))
from multiprocessing import Pool
import multiprocessing
import scipy.sparse as sp
import collections
from flask import Flask
import pandas as pd
from rake_nltk import Metric, Rake
import spacy
from textblob import TextBlob
from flask import Flask, request, render_template
import pandas as pd
import string
import json
from textblob.np_extractors import ConllExtractor
from flask_jsonpify import jsonpify
from multiprocessing import Pool
import scipy.sparse as sp
import numpy as np
import pickle
import swifter


app = Flask(__name__)

def readData(path):
    # Read NPZ and Transform to Datafram. Single Question Responses. Sample is on 51 Records
    return pd.read_csv(path)
  
def loadSpacyModel(ModelName):
    return spacy.load(ModelName)

def Dataframe2JsonObject(dataframe):
#     df_list = dataframe.values.tolist()
#     JSONP_data = jsonpify(df_list)
#     return JSONP_data
    return dataframe.to_json(orient='records')

def getSimilarity_topic(eachValue):
  SimilarityDict = {}
  SimilarityList = []
  for eachTopic in TopicList:
    try:
      SimilarityValue = float("{0:.2f}".format(nlp(eachValue).similarity(nlp(eachTopic))))
      if SimilarityValue > 0.90 and SimilarityValue < 1:
        StrengthValue = float("{0:.2f}".format(ImportantTopicsDF.loc[ImportantTopicsDF['ImportantTopic'] == eachTopic]['Strength%'].values[0]))
        SentimentValue = float("{0:.2f}".format(TextBlob(eachTopic).sentiment.polarity))
        SimilarityList.append([(eachTopic,SimilarityValue), ('Strength', StrengthValue ), ('sentiment', SentimentValue) ])
    except:
      SimilarityList =[]
  SimilarityDict[eachValue] = SimilarityList
#   print (SimilarityDict)
  return SimilarityDict

def parallelize(data, func):
#     a,b,c,d,e,f,g,h,i,j = np.array_split(data, num_partitions)
    a = np.array_split(data, num_partitions)
    pool = Pool(num_cores)
#     df = pd.concat(pool.map(func, [a,b,c,d,e,f,g,h,i,j ]))
    df = sp.vstack(pool.map(func, a), format='csr')
    pool.close()
    pool.join()
    return df
  
def ParallelizeDFOperation(ImportantTopicsDF):
  ImportantTopicsDF['TopicSimilarity'] = ImportantTopicsDF['ImportantTopic'].apply(getSimilarity_topic)
  return ImportantTopicsDF

def callParallelDFOperationFunction():
  return ParallelizeDFOperation(ImportantTopicsDF)

def getTopicList(PicklePath):
  with open(PicklePath, "rb") as f:
    return pickle.load(f)

@app.route('/')
def SurveyAnalysis():
    
    global TopicList
    TopicList = getTopicList("TopicListDump.dat")
    global ImportantTopicsDF
    ImportantTopicsDF = readData('ImportantTopicsDF.csv')
    
    global nlp
    ModelName = 'en_core_web_md'
    
    global num_partitions
    num_partitions = 4
    global num_cores
    
    global ParallelizeDFOperation
    num_cores = multiprocessing.cpu_count()
    data = parallelize(ImportantTopicsDF, ParallelizeDFOperation)
    
    
    return Dataframe2JsonObject(data)

if __name__ == '__main__':
    # nlp = spacy.load("en_core_web_sm")
    app.run(threaded=True)

# import threading
# threading.Thread(target=app.run, kwargs={'host':'0.0.0.0','port':80}).start() 

Overwriting server_similarity.py


## MultiProcess Server Run

In [0]:
import os
import subprocess
import sys
import time
import socket
# Start a subprocess that runs the Flask server
p = subprocess.Popen([sys.executable, "-m", "flask", "run"], env=dict(**os.environ, FLASK_APP="server_similarity.py"), stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Start two subthreads that forward the output from the Flask server to the output of the Jupyter notebook
def forward(i, o):
    while p.poll() is None:
        l = i.readline().decode('utf-8')
        if l:
            o.write("[SERVER] " + l)

import threading
threading.Thread(target=forward, args=(p.stdout, sys.stdout)).start()
threading.Thread(target=forward, args=(p.stderr, sys.stderr)).start()
import socket
print(socket.gethostbyname(socket.getfqdn(socket.gethostname())))
# sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
# Let's give the server a bit of time to make sure it has started
time.sleep(2)

172.28.0.2
[SERVER]  * Serving Flask app "server_similarity.py"
[SERVER]  * Environment: production
[SERVER]    Use a production WSGI server instead.
[SERVER]  * Debug mode: off


[SERVER]  * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


## Request

In [0]:
import requests
# r = requests.get("http://172.28.0.2/")
r = requests.get(" http://127.0.0.1:5000/")
print(r.status_code)
print(r.encoding)
print(r.apparent_encoding)
print(r.text)

[SERVER] 172.28.0.2
500
ISO-8859-1
ascii
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>



[SERVER] [2019-04-18 11:08:49,694] ERROR in app: Exception on / [GET]
[SERVER] Traceback (most recent call last):
[SERVER]   File "/usr/local/lib/python3.6/dist-packages/flask/app.py", line 2292, in wsgi_app
[SERVER]     response = self.full_dispatch_request()
[SERVER]   File "/usr/local/lib/python3.6/dist-packages/flask/app.py", line 1815, in full_dispatch_request
[SERVER]     rv = self.handle_user_exception(e)
[SERVER]   File "/usr/local/lib/python3.6/dist-packages/flask/app.py", line 1718, in handle_user_exception
[SERVER]     reraise(exc_type, exc_value, tb)
[SERVER]   File "/usr/local/lib/python3.6/dist-packages/flask/_compat.py", line 35, in reraise
[SERVER]     raise value
[SERVER]   File "/usr/local/lib/python3.6/dist-packages/flask/app.py", line 1813, in full_dispatch_request
[SERVER]     rv = self.dispatch_request()
[SERVER]   File "/usr/local/lib/python3.6/dist-packages/flask/app.py", line 1799, in dispatch_request
[SERVER]     return self.view_functions[rule.endpoint](**req

## Request 2 Dataframe 

## Kill Port 

In [0]:
!ps -fA | grep python

root          32      22  0 06:16 ?        00:00:05 /usr/bin/python2 /usr/local/bin/jupyter-notebook --ip="172.28.0.2" --port=9000 --FileContentsManager.root_dir="/" --MappingKernelManager.root_dir="/content"
root         137      32  0 06:17 ?        00:00:24 /usr/bin/python3 -m ipykernel_launcher -f /root/.local/share/jupyter/runtime/kernel-aba16ff0-59c5-4cb7-a7a7-7d72427bcc62.json
root         284     137  0 06:23 ?        00:00:00 /usr/bin/python3 -Wignore:::pip._internal.cli.base_command -c from multiprocessing.semaphore_tracker import main;main(56)
root         594     137  0 07:38 ?        00:00:00 /bin/bash -c ps -fA | grep python
root         596     594  0 07:38 ?        00:00:00 grep python


In [0]:
!kill 563

# Experimental Codes

In [0]:
InportantTopicsDF.to_csv("InportantTopicsDF.csv")

In [0]:
InportantTopicsDF.head(5)

Unnamed: 0,index,TotalCountAllAnswer,Strength%
22,spicy taste good,67,16.301703
17,savory taste good,60,14.59854
11,sour taste good,49,11.922141
10,sweetness taste good,41,9.975669
14,viscositys good,34,8.272506


In [0]:
InportantTopicsDF

In [0]:
## For each Entity; groupby and then give sentiment ; taste - good(10), bad(2), strong(2)   -> Sentiment 

In [0]:
flat_list = [item for sublist in list(data['Topics']) for item in sublist]

In [0]:
len(flat_list)

1043

In [0]:
len(set(flat_list))

342

In [0]:
import collections
counter=collections.Counter(flat_list)
print(counter)

Counter({'spicy taste good': 67, 'savory taste good': 60, 'sour taste good': 49, 'sweetness taste good': 41, 'viscositys good': 34, 'salty taste good': 30, 'chilli smells strong': 26, 'spicy taste': 23, 'garlic taste good': 20, 'textures soft': 19, 'garlic taste': 17, 'garlic smells': 16, 'chilli smells good': 16, 'salty taste': 14, 'spicy taste strong': 14, 'sour taste': 13, 'garlic smells good': 11, 'garlic smells strong': 9, 'garlic smell': 7, 'little bit': 7, 'real chilli': 6, 'chilli smells strong good': 6, 'colour need': 6, 'light red': 5, 'colour needs': 5, 'viscosity isnt good enough': 5, 'garlic taste strong': 4, 'light red colour': 4, 'red colour': 4, 'vicositys good': 4, 'sauce smells good': 4, 'sour sweet': 4, 'spicy stays long': 4, 'savory taste': 4, 'feel nauseous': 3, 'garlic smell strong': 3, 'chilli smell fresh': 3, 'light red colours attractive': 3, 'appearance colours bright': 3, 'tastes unique': 3, 'light colour': 3, 'sweetness taste strong': 3, 'taste good': 3, 'co

In [0]:
df = pd.DataFrame.from_dict(counter, orient='index').reset_index()
df = df.sort_values(by=[0], ascending=False)

In [0]:
df.head(20)

Unnamed: 0,index,0
22,spicy taste good,67
17,savory taste good,60
11,sour taste good,49
10,sweetness taste good,41
14,viscositys good,34
34,salty taste good,30
2,chilli smells strong,26
3,spicy taste,23
18,garlic taste good,20
20,textures soft,19


In [0]:
import string
nlp = spacy.load("en_core_web_lg")
import itertools
for a, b in itertools.combinations(set(flat_list), 2):
  a = a.translate(str.maketrans('', '', string.punctuation))
  b = b.translate(str.maketrans('', '', string.punctuation))
  doc1 = nlp(a)
  doc2 = nlp(b)
  if (doc1.similarity(doc2)) > 0.90:
    print (doc1.similarity(doc2), "--", a, "----", b)

0.9398877440476473 -- soup needs ---- soup need
0.9097327742899014 -- real chilli smells good ---- chilli smells need
0.9468863610712461 -- real chilli smells good ---- real chilli smells fresh
0.9084437126782787 -- real chilli smells good ---- real chilli taste strong
0.9093422154720056 -- real chilli smells good ---- chilli taste real
0.9204865486419873 -- real chilli smells good ---- chilli smell strong good
0.9381872755771946 -- real chilli smells good ---- chilli smells strong good
0.962028227751544 -- real chilli smells good ---- real chilli smells strong
0.91712002401567 -- real chilli smells good ---- sauce smells good good
0.9661184575054693 -- real chilli smells good ---- real chilli smells
0.946597454450995 -- real chilli smells good ---- real chilli taste good
0.938187274397767 -- real chilli smells good ---- chilli smells good strong
0.9053381403485958 -- real chilli smells good ---- smells like chilli
0.9661183835609269 -- real chilli smells good ---- chilli smells real
0

KeyboardInterrupt: ignored

In [0]:
import os
import subprocess
import sys
import time
import socket
# Start a subprocess that runs the Flask server
p = subprocess.Popen([sys.executable, "-m", "flask", "run"], env=dict(**os.environ, FLASK_APP="server.py"), stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Start two subthreads that forward the output from the Flask server to the output of the Jupyter notebook
def forward(i, o):
    while p.poll() is None:
        l = i.readline().decode('utf-8')
        if l:
            o.write("[SERVER] " + l)

import threading
threading.Thread(target=forward, args=(p.stdout, sys.stdout)).start()
threading.Thread(target=forward, args=(p.stderr, sys.stderr)).start()
import socket
print(socket.gethostbyname(socket.getfqdn(socket.gethostname())))
# sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
# Let's give the server a bit of time to make sure it has started
time.sleep(2)

In [0]:
# reduce the dark colour of the sauce// need to be more spicy// the texture of the sauce need to be softer

import spacy

nlp = spacy.load("en_core_web_lg")
doc = nlp(u"reduce the dark colour of the sauce// need to be more spicy// the texture of the sauce need to be softer")

In [0]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

reduce reduce VERB VB ROOT xxxx True False
the the DET DT det xxx True False
dark dark ADJ JJ amod xxxx True False
colour colour NOUN NN dobj xxxx True False
of of ADP IN prep xx True False
the the DET DT det xxx True False
sauce// sauce// NOUN NN pobj xxxx// False False
need need VERB VBP dobj xxxx True False
to to PART TO aux xx True False
be be VERB VB xcomp xx True False
more more ADJ JJR advmod xxxx True False
spicy// spicy// ADP IN attr xxxx// False False
the the DET DT det xxx True False
texture texture NOUN NN attr xxxx True False
of of ADP IN prep xx True False
the the DET DT det xxx True False
sauce sauce NOUN NN pobj xxxx True False
need need VERB VBP dep xxxx True False
to to PART TO aux xx True False
be be VERB VB xcomp xx True False
softer soft ADJ JJR acomp xxxx True False


In [0]:
noun_adj_pairs = []
for i,token in enumerate(doc):
    if token.pos_ not in ('NOUN','PROPN','VERB'):
        continue
    for j in range(i+1,len(doc)):
        if doc[j].pos_ == 'ADJ':
            noun_adj_pairs.append((token,doc[j]))
            break
noun_adj_pairs

[(reduce, dark),
 (colour, more),
 (sauce//, more),
 (need, more),
 (be, more),
 (texture, softer),
 (sauce, softer),
 (need, softer),
 (be, softer)]

In [0]:
data.columns

Index(['D3a', 'Translate', 'Topics', 'NounAdjectivePair', 'CleanTranslate'], dtype='object')

In [0]:
NounAdjectivePair = [item for sublist in list(data['NounAdjectivePair']) for item in sublist]


In [0]:
len(NounAdjectivePair)

2048

In [0]:
NounAdjectivePair[0:20]

[('taste', 'enough'),
 ('is', 'enough'),
 ('taste', 'enough'),
 ('is', 'enough'),
 ('taste', 'sour'),
 ('taste', 'enough'),
 ('is', 'enough'),
 ('sweetness', 'enough'),
 ('is', 'enough'),
 ('viscosity', 'enough'),
 ('is', 'enough'),
 ('colour', 'bright'),
 ('is', 'bright'),
 ('expected', 'good'),
 ('taste', 'good'),
 ('is', 'good'),
 ('does', 'nauseous'),
 ('make', 'nauseous'),
 ('feel', 'nauseous'),
 ('garlic', 'strong')]

In [0]:
len(list(set(NounAdjectivePair)))

562

In [0]:
from collections import Counter

counts = Counter((NounAdjectivePair))
print(counts)

Counter({('taste', 'good'): 285, ('taste', 'strong'): 85, ('chilli', 'strong'): 55, ('smells', 'strong'): 55, ('sweetness', 'good'): 47, ('smells', 'good'): 47, ('garlic', 'strong'): 44, ('chilli', 'good'): 37, ('viscositys', 'good'): 35, ('is', 'good'): 34, ('are', 'good'): 29, ('is', 'strong'): 28, ('garlic', 'good'): 24, ('textures', 'soft'): 18, ('spicy', 'good'): 17, ('taste', 'sour'): 16, ('smell', 'strong'): 15, ('texture', 'soft'): 15, ('garlic', 'sour'): 15, ('viscosity', 'good'): 14, ('colour', 'bright'): 13, ('sauce', 'good'): 13, ('is', 'bright'): 12, ('is', 'soft'): 12, ('colours', 'bright'): 12, ('garlic', 'spicy'): 12, ('taste', 'spicy'): 12, ('sweetness', 'savory'): 12, ('smell', 'good'): 12, ('sweetness', 'sour'): 11, ('chilli', 'fresh'): 10, ('taste', 'garlic'): 10, ('chilli', 'sour'): 10, ('taste', 'real'): 10, ('good', 'good'): 9, ('spicy', 'strong'): 9, ('good', 'strong'): 9, ('are', 'strong'): 9, ('colour', 'good'): 8, ('is', 'light'): 8, ('be', 'darker'): 8, ('ad

In [0]:
df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
df = df.sort_values(by=[0], ascending=False)

In [0]:
df

Unnamed: 0,index,0
8,"(taste, good)",285
22,"(taste, strong)",85
16,"(smells, strong)",55
15,"(chilli, strong)",55
19,"(sweetness, good)",47
32,"(smells, good)",47
13,"(garlic, strong)",44
41,"(chilli, good)",37
23,"(viscositys, good)",35
9,"(is, good)",34


In [0]:
import string
nlp = spacy.load("en_core_web_lg")
import itertools
for a, b in itertools.combinations(set(NounAdjectivePair), 2):
  doc1 = nlp(" ".join(list(a)))
  doc2 = nlp(" ".join(list(b)))
  if (doc1.similarity(doc2)) > 0.90:
    print (doc1.similarity(doc2), "--", a, "----", b)

0.9365073991899543 -- ('need', 'darker') ---- ('needs', 'darker')
0.9082355172906906 -- ('need', 'darker') ---- ('be', 'darker')
0.938455445932055 -- ('good', 'garlic') ---- ('garlic', 'enough')
0.9051077355808526 -- ('good', 'garlic') ---- ('garlic', 'strong')
1.0000000005207719 -- ('good', 'garlic') ---- ('garlic', 'good')
0.9101404422584927 -- ('good', 'garlic') ---- ('need', 'garlic')
0.9022057605549657 -- ('good', 'garlic') ---- ('be', 'garlic')
0.932361584062914 -- ('be', 'attractive') ---- ('can', 'attractive')
0.9182084090068708 -- ('be', 'attractive') ---- ('is', 'attractive')
1.0000000440557197 -- ('savory', 'good') ---- ('good', 'savory')
0.9277344031312746 -- ('savory', 'good') ---- ('savory', 'enough')
0.9403574486073365 -- ('make', 'sour') ---- ('want', 'sour')
0.9039130908833113 -- ('make', 'sour') ---- ('good', 'sour')
0.9080168541357858 -- ('make', 'sour') ---- ('be', 'sour')
0.9039130908833113 -- ('make', 'sour') ---- ('sour', 'good')
0.9337268080484242 -- ('make', 's

KeyboardInterrupt: ignored

In [0]:
import pickle 

infile = open("data.pickle",'rb')
l = pickle.load(infile)

In [0]:
l

['This product so far has not disappointed. My children love to use it and I like the ability to monitor control what content they see with ease.',
 'great for beginner or experienced person. Bought as a gift and she loves it',
 'Inexpensive tablet for him to use and learn on, step up from the NABI. He was thrilled with it, learn how to Skype on it already...',
 "I've had my Fire HD 8 two weeks now and I love it. This tablet is a great value.We are Prime Members and that is where this tablet SHINES. I love being able to easily access all of the Prime content as well as movies you can download and watch laterThis has a 1280/800 screen which has some really nice look to it its nice and crisp and very bright infact it is brighter then the ipad pro costing $900 base model. The build on this fire is INSANELY AWESOME running at only 7.7mm thick and the smooth glossy feel on the back it is really amazing to hold its like the futuristic tab in ur hands.",
 'I bought this for my grand daughter 