In [81]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import csv
from nltk import ngrams, FreqDist


# Shared NLP helpers used by the cells below.
stemmer = nltk.PorterStemmer()
wnl = nltk.WordNetLemmatizer()

# English stop words, kept as a list because a later cell slices it.
stop_words = list(stopwords.words('english'))

# Apply seaborn's default plot styling.
sns.set()

In [82]:
# Peek at the first ten stop words.
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [83]:
# Load the keyword phrases. The CSV's first column has no header, so pandas
# exposes it as a regular 'Unnamed: 0' column (visible in the preview below).
df = pd.read_csv('keywords.csv')

In [84]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,keywords
0,avoid jet lag flying to usa
1,effects of jet lag
2,how long does jet lag last
3,how to deal with jet lag from asia
4,how to prevent jet lag on long flights


In [85]:
# De-duplicate on the keyword text only. Comparing whole rows never drops
# anything here, because the 'Unnamed: 0' column differs on every row.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
df = df.drop_duplicates(subset=['keywords'])

In [86]:
# Pull the 'keywords' column out as a plain Python list.
keywords = df['keywords'].tolist()

In [87]:
# Sanity check: first two keyword phrases.
keywords[:2]

['avoid jet lag flying to usa', 'effects of jet lag']

In [88]:
# Module-level accumulators: `lst` collects every stem produced by
# stem_leman; `dct` starts out as an empty mapping.
lst = []
dct = {}

In [89]:
def stem_leman(keyword):
    """Tokenise one keyword phrase, drop stop words, stem, and record the stems.

    Despite the name, only Porter stemming is applied; no lemmatisation
    happens here.

    Args:
        keyword: The keyword phrase (a string) to process.

    Returns:
        The list of stems for this keyword, in their original order. The same
        stems are also appended to the module-level ``lst``, which the
        frequency counts further down are built from.
    """
    tokens = nltk.word_tokenize(keyword)
    # Compare case-insensitively: the stop-word list is lower-case and the
    # Porter stemmer lower-cases its output, so a token such as "The" used to
    # slip past the filter and be counted as "the".
    stems = [stemmer.stem(tok) for tok in tokens if tok.lower() not in stop_words]
    lst.extend(stems)
    return stems

In [90]:
# Start from an empty accumulator so that re-running this cell does not add a
# second copy of every stem (stem_leman appends to the shared `lst`).
lst.clear()
for keyword in keywords:
    stem_leman(keyword)

In [91]:
# Total number of stems collected across all keywords.
len(lst)

226927

In [92]:
# First ten collected stems.
lst[:10]

['avoid', 'jet', 'lag', 'fli', 'usa', 'effect', 'jet', 'lag', 'long', 'jet']

In [93]:
# Frequency of each stem across the whole keyword set.
counter = Counter(lst)

In [94]:
# Number of distinct stems.
len(counter)

7005

In [96]:
# Ten most frequent stems.
counter.most_common(10)

[('holiday', 7837),
 ('best', 4896),
 ('uk', 4587),
 ('place', 3390),
 ('travel', 3369),
 ('cheap', 2342),
 ('london', 2291),
 ('hotel', 2138),
 ('flight', 2104),
 ('2019', 1999)]

In [99]:
# Export every distinct stem with its frequency and a part-of-speech tag.
with open('word count.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['word', 'count', 'pos_tag'])
    for word, count in counter.items():
        # pos_tag returns [(word, tag)]; keep only the tag so the column holds
        # the tag itself rather than the repr of a one-element list of tuples.
        # Each stem is tagged on its own, without its original context.
        tag = nltk.pos_tag([word])[0][1]
        writer.writerow([word, count, tag])

In [120]:
# Count bigrams inside each keyword separately. Building them from the flat
# `lst` also pairs the last stem of one keyword with the first stem of the
# next (e.g. 'usa' followed by 'effect'), which are not real phrases.
# Tokens are filtered against the stop words case-insensitively and stemmed.
ngrams_counter = FreqDist(
    bigram
    for keyword in keywords
    for bigram in ngrams(
        [
            stemmer.stem(token)
            for token in nltk.word_tokenize(keyword)
            if token.lower() not in stop_words
        ],
        2,
    )
)

In [121]:
# Twenty most frequent bigrams.
ngrams_counter.most_common(20)

[(('best', 'place'), 969),
 (('place', 'visit'), 687),
 (('new', 'york'), 609),
 (('last', 'minut'), 578),
 (('place', 'go'), 571),
 (('new', 'year'), 566),
 (('travel', 'insur'), 532),
 (('credit', 'card'), 490),
 (('citi', 'break'), 473),
 (('holiday', 'destin'), 460),
 (('hand', 'luggag'), 374),
 (('christma', 'market'), 370),
 (('thoma', 'cook'), 367),
 (('cheap', 'holiday'), 327),
 (('holiday', '2020'), 324),
 (('car', 'hire'), 315),
 (('break', 'uk'), 314),
 (('famili', 'holiday'), 308),
 (('bank', 'holiday'), 305),
 (('inclus', 'holiday'), 300)]

In [124]:
# Keep only the bigrams that occur more than 10 times.
reduced_dct = {
    bigram: freq
    for bigram, freq in ngrams_counter.items()
    if freq > 10
}

In [128]:
# Export the frequent bigrams; each row is the bigram tuple and its count.
with open('word count tuples.csv', 'w', encoding='utf-8', newline='') as csvfile:
    out = csv.writer(csvfile)
    out.writerow(['word', 'count'])
    # Each (bigram, count) item is already a two-field row.
    out.writerows(reduced_dct.items())