# Data science project with nlp analysis of text from W.E.B. Du Bois' The Souls of Black Folk

# IMPORT LIBRARIES AND DATA

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)
from wordcloud import WordCloud
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
#python's lxml library parses xml and html files
from lxml import html
#python requests library gets data from web pages
import requests
from functools import reduce
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

In [2]:
#download text from W.E.B. Du Bois' The Souls of Black Folk
page = requests.get('https://www.gutenberg.org/files/408/408-h/408-h.htm')
mytree = html.fromstring(page.content)
sobf1 = mytree.xpath('body/div[@class="chapter"]/p/text()')
sobf1

['\r\nHerein lie buried many things which if read with patience may show the strange\r\nmeaning of being black here at the dawning of the Twentieth Century. This\r\nmeaning is not without interest to you, Gentle Reader; for the problem of the\r\nTwentieth Century is the problem of the color line.\r\n',
 '\r\nI pray you, then, receive my little book in all charity, studying my words with\r\nme, forgiving mistake and foible for sake of the faith and passion that is in\r\nme, and seeking the grain of truth hidden there.\r\n',
 '\r\nI have sought here to sketch, in vague, uncertain outline, the spiritual world\r\nin which ten thousand thousand Americans live and strive. First, in two\r\nchapters I have tried to show what Emancipation meant to them, and what was its\r\naftermath. In a third chapter I have pointed out the slow rise of personal\r\nleadership, and criticized candidly the leader who bears the chief burden of\r\nhis race to-day. Then, in two other chapters I have sketched in swi

In [3]:
#use list comprehension to remove extra text in list
sobf1=[a.replace("\r\n"," ") for a in sobf1]
sobf1=[b.replace("Mr.","Mr") for b in sobf1]
sobf1=[c.replace("Mrs.","Mrs") for c in sobf1]
sobf1=[d.replace("MRS.","MRS") for d in sobf1]
sobf1=[e.replace("MR.","MR") for e in sobf1]
sobf1=[g.strip() for g in sobf1]
sobf1

['Herein lie buried many things which if read with patience may show the strange meaning of being black here at the dawning of the Twentieth Century. This meaning is not without interest to you, Gentle Reader; for the problem of the Twentieth Century is the problem of the color line.',
 'I pray you, then, receive my little book in all charity, studying my words with me, forgiving mistake and foible for sake of the faith and passion that is in me, and seeking the grain of truth hidden there.',
 'I have sought here to sketch, in vague, uncertain outline, the spiritual world in which ten thousand thousand Americans live and strive. First, in two chapters I have tried to show what Emancipation meant to them, and what was its aftermath. In a third chapter I have pointed out the slow rise of personal leadership, and criticized candidly the leader who bears the chief burden of his race to-day. Then, in two other chapters I have sketched in swift outline the two worlds within and without the V

In [4]:
#make each sentence an element in list
sobf1=[re.split('\. |\? |\! |\" ',h) for h in sobf1]
sobf = []
for inner_list in sobf1:
    for ele in inner_list:
        sobf.append(ele) 

In [5]:
sobf

['Herein lie buried many things which if read with patience may show the strange meaning of being black here at the dawning of the Twentieth Century',
 'This meaning is not without interest to you, Gentle Reader; for the problem of the Twentieth Century is the problem of the color line.',
 'I pray you, then, receive my little book in all charity, studying my words with me, forgiving mistake and foible for sake of the faith and passion that is in me, and seeking the grain of truth hidden there.',
 'I have sought here to sketch, in vague, uncertain outline, the spiritual world in which ten thousand thousand Americans live and strive',
 'First, in two chapters I have tried to show what Emancipation meant to them, and what was its aftermath',
 'In a third chapter I have pointed out the slow rise of personal leadership, and criticized candidly the leader who bears the chief burden of his race to-day',
 'Then, in two other chapters I have sketched in swift outline the two worlds within and w

In [6]:
#get number of elements or sentences in list
len(sobf)

2732

In [7]:
#turn list into a dictionary and into a dataframe
listdict={"text":sobf}
data=pd.DataFrame(listdict)

In [8]:
data.info

<bound method DataFrame.info of                                                    text
0     Herein lie buried many things which if read wi...
1     This meaning is not without interest to you, G...
2     I pray you, then, receive my little book in al...
3     I have sought here to sketch, in vague, uncert...
4     First, in two chapters I have tried to show wh...
...                                                 ...
2727  Let there spring, Gentle One, from out its lea...
2728  Let the ears of a guilty people tingle with tr...
2729  Thus in Thy good time may infinite reason turn...
2730                                                   
2731                                                   

[2732 rows x 1 columns]>

In [9]:
# Let's get the length of the sentences in the text
data['length']=data['text'].apply(len)
data

Unnamed: 0,text,length
0,Herein lie buried many things which if read wi...,146
1,"This meaning is not without interest to you, G...",134
2,"I pray you, then, receive my little book in al...",206
3,"I have sought here to sketch, in vague, uncert...",135
4,"First, in two chapters I have tried to show wh...",103
...,...,...
2727,"Let there spring, Gentle One, from out its lea...",116
2728,Let the ears of a guilty people tingle with tr...,186
2729,Thus in Thy good time may infinite reason turn...,123
2730,,0


In [10]:
#look at longest sentence
data[data['length']==np.max(data['length'])]['text'].iloc[0]

'Amid it all, two figures ever stand to typify that day to coming ages,—the one, a gray-haired gentleman, whose fathers had quit themselves like men, whose sons lay in nameless graves; who bowed to the evil of slavery because its abolition threatened untold ill to all; who stood at last, in the evening of life, a blighted, ruined form, with hate in his eyes;—and the other, a form hovering dark and mother-like, her awful face black with the mists of centuries, had aforetime quailed at that white master’s command, had bent in love over the cradles of his sons and daughters, and closed in death the sunken eyes of his wife,—aye, too, at his behest had laid herself low to his lust, and borne a tawny man-child to the world, only to see her dark boy’s limbs scattered to the winds by midnight marauders riding after “damned Niggers.” These were the saddest sights of that woful day; and no man clasped the hands of these two passing figures of the present-past; but, hating, they went to their lon

In [None]:
#look at rows with length less than or equal to 5
data[data['length']<=5].value_counts()

In [None]:
#drop rows with low lengths
data=data[data['length']>5]
data

In [None]:
data.info

# exploratory data analysis

In [None]:
#histogram of length variable
data['length'].plot(bins=50, kind='hist') 

In [None]:
# create log10 of length due to right skew of length column
data['lengthlog10']=np.log10(data['length'])
data['lengthlog10'].value_counts()

In [None]:
#histogram of logarithm bse 10 of length variable
data['lengthlog10'].plot(bins=50, kind='hist') 

In [None]:
data.describe()

In [None]:
#look at longest sentence
data[data['length']==3568]

# plot word cloud

In [None]:
#put text column into a single list
sentences=data['text'].tolist()
sentences

In [None]:
#get length of list
len(sentences)

In [None]:
#make list into a single string instead of havin gmultiple elements
sentences_as_one_string = " ".join(sentences)
sentences_as_one_string

In [None]:
#get number of characters and spaces in single string
len(sentences_as_one_string)

In [None]:
#plot word cloud
plt.figure(figsize=(20,20))
plt.imshow(WordCloud().generate(sentences_as_one_string))