In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import math
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from wordcloud import WordCloud, ImageColorGenerator
from sklearn.decomposition import LatentDirichletAllocation, NMF
import nltk
from collections import Counter
import textblob            #to import
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
import seaborn as sns

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

In [3]:
# Importing dataset
df = pd.read_csv("https://raw.githubusercontent.com/dzindili/Master-thesis/main/textdata.csv")
df.head()

Unnamed: 0,Text
0,"enormous tropical forests, little known to man..."
1,"\n\n\nThere is not in Italy, they say (and I b..."
2,\nSection 1.\n\nPHYSICK FOR THE SICKNESSE.\nTH...
3,\n\n\nReligion is the greatest enemy to religi...
4,\nThese unspeakable benefits which ye have co...


In [None]:
#remove \n
for i in df:
    df[i] = df[i].str.replace('\n', '')


In [None]:
text = (df.iloc[:,0])

df['sentences'] = text.apply(lambda x: re.split(r'[.!?]+', x))

df['sentence_count_RE'] = text.apply(lambda x: len(re.split(r'[.!?]+', x)))
df['doc_num'] = range(1, 1+len(df))
df

In [None]:
x = df['doc_num']
y = df['sentence_count_RE']

In [None]:
plt.figure(figsize=(50,30))
plt.margins(0.03)
plt.xlabel('Document', fontsize=70)
plt.xticks(fontsize=60)
plt.ylabel('Sentence frequency', fontsize=70)
plt.yticks(fontsize=60)
plt.title('Sentence Count for each document', fontsize=80)


ax = sns.barplot(x, y, palette="Blues_d")
plt.xticks([10,20,30,40,50,60,70,80],  [10,20,30,40,50,60,70,80])
sns.set_context("poster")

In [None]:
sum(df['sentence_count_RE'])

4241

In [None]:
max(df['sentence_count_RE'])

317

In [None]:
min(df['sentence_count_RE'])

6

In [5]:
#remove puncuation
for i in df:
    df[i] = df[i].str.replace(r'[^\w\s]+', '')

  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
#remove all numbers
for i in df:
    df[i] = df[i].apply(lambda x: re.sub('W*dw*','',x))

In [7]:
#make all words lowercase
for i in df:
    df[i] = df[i].apply(lambda x:x.lower())

In [8]:
#remove stopwords
stop_words = set(stopwords.words('english'))
stop_words.add('subject')
stop_words.add('http')

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])

for i in df:
    df[i] = df[i].apply(lambda x: remove_stopwords(x))

In [9]:
#lemmetization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

for i in df:
    df[i] = df[i].apply(lambda text: lemmatize_words(text))

In [10]:
text = (df.iloc[:,0])

In [11]:
for i in df:
    df[i] = df[i].str.split()

In [None]:
df['lemma_str'] = [' '.join(map(str,l)) for l in text]
df.head()

Unnamed: 0,Text,lemma_str
0,"[enormous, tropical, forest, little, known, ma...",enormous tropical forest little known manan ga...
1,"[italy, say, believe, lovelier, resience, pala...",italy say believe lovelier resience palazzo pe...
2,"[section, 1physick, sicknessethe, wor, plague,...",section 1physick sicknessethe wor plague engli...
3,"[religion, greatest, enemy, religion, false, t...",religion greatest enemy religion false true fa...
4,"[unspeakable, benefit, ye, conceive, wor, ye, ...",unspeakable benefit ye conceive wor ye receive...


In [None]:
df['sentiment'] = df['lemma_str'].apply(lambda x: TextBlob(x).sentiment.polarity)
df.head()

Unnamed: 0,Text,lemma_str,sentiment
0,"[enormous, tropical, forest, little, known, ma...",enormous tropical forest little known manan ga...,0.308886
1,"[italy, say, believe, lovelier, resience, pala...",italy say believe lovelier resience palazzo pe...,0.201321
2,"[section, 1physick, sicknessethe, wor, plague,...",section 1physick sicknessethe wor plague engli...,0.166927
3,"[religion, greatest, enemy, religion, false, t...",religion greatest enemy religion false true fa...,0.207803
4,"[unspeakable, benefit, ye, conceive, wor, ye, ...",unspeakable benefit ye conceive wor ye receive...,0.221223


In [None]:
plt.figure(figsize=(50,30))
plt.margins(0.02)
plt.xlabel('Sentiment', fontsize=50)
plt.xticks(fontsize=40)
plt.ylabel('Frequency', fontsize=50)
plt.yticks(fontsize=40)
plt.hist(df['sentiment'], bins=50)
plt.title('Sentiment Distribution', fontsize=60)
plt.show()

In [None]:
df['word_count'] = text.apply(lambda x: len(str(x).split()))
df['review_len'] = df['lemma_str'].astype(str).apply(len)

In [None]:
df['word_count'].sum()

52068

In [12]:
allwords = []
for wordlist in text:
    allwords += wordlist
print(allwords)



In [None]:
mostcommon = FreqDist(allwords).most_common(100)
wordcloud = WordCloud(width=1600, height=800, background_color='white').generate(str(mostcommon))
fig = plt.figure(figsize=(30,10), facecolor='white')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')

plt.tight_layout(pad=0)
plt.show()

In [None]:
mostcommon_small = FreqDist(allwords).most_common(25)
x, y = zip(*mostcommon_small)
plt.figure(figsize=(50,30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel('Words', fontsize=70)
plt.ylabel('Frequency of Words', fontsize=70)
plt.yticks(fontsize=70)
plt.xticks(rotation=60, fontsize=70)

plt.title('Frequency of 25 Most Common Words', fontsize=70)

x = np.array(x)
y = np.array(y) 

ax = sns.barplot(x, y, palette="Blues_d")
sns.set_context("poster")

In [25]:
data = {'Domain':['Household & Recipes', 'Medicine & Botany', 'Public Health',
'Science & Philosophy','Other', 'Theatre', 'Literature', 'Perfumes & Fashion', 'Religion','Travel & Ethnography'], 
'Count':[1, 4,1,2,3,1, 3,5,1,1]}
df = pd.DataFrame(data)  
df

Unnamed: 0,Domain,Count
0,Household & Recipes,1
1,Medicine & Botany,4
2,Public Health,1
3,Science & Philosophy,2
4,Other,3
5,Theatre,1
6,Literature,3
7,Perfumes & Fashion,5
8,Religion,1
9,Travel & Ethnography,1


In [None]:
x = df['Count']
y = df['Domain']

plt.figure(figsize=(50,30))
plt.margins(0.03)
plt.xlabel('Count', fontsize=70)
plt.xticks(fontsize=70)
plt.ylabel('Domain frequency', fontsize=70)
plt.yticks(fontsize=70)

ax = sns.barplot(x, y, palette="Blues_d")
sns.set_context("poster")