In [1]:
import pandas as pd
import numpy as np

In [2]:
compiled = pd.read_csv('ATL_1941.csv')

Remove NaN values and replace with empty string

In [3]:
compiled = compiled.replace(np.nan, '')

Merge title, abstract and pargraph text and drop those columns

In [4]:
compiled['original_text'] = compiled['RecordTitle'] + ' ' + compiled['FullText'] + ' ' + compiled['Abstract']
compiled['text'] = compiled['RecordTitle'] + ' ' + compiled['FullText'] + ' ' + compiled['Abstract']
compiled.drop(columns=['RecordTitle', 'FullText', 'Abstract'])

Unnamed: 0.1,Unnamed: 0,RecordID,Publisher,AlphaPubDate,NumericPubDate,ObjectType,StartPage,original_text,text
0,135076,490635133,Atlanta Daily World,"Jan 31, 1941",19410131,Front Page/Cover Story,1,50 BTW Grads Get Diplomas This Evening ...,50 BTW Grads Get Diplomas This Evening ...
1,135104,490635186,Atlanta Daily World,"Jan 31, 1941",19410131,Advertisement,3,Display Ad 4 -- No Title ...,Display Ad 4 -- No Title ...
2,135143,490635240,Atlanta Daily World,"Jan 31, 1941",19410131,Image/Photograph,1,In 13th Defense ...,In 13th Defense ...
3,135152,490635257,Atlanta Daily World,"Jan 31, 1941",19410131,Front Page/Cover Story,1,"MAN, WOMAN FOUND GUILTY OF STABBING ...","MAN, WOMAN FOUND GUILTY OF STABBING ..."
4,135161,490635272,Atlanta Daily World,"Jan 31, 1941",19410131,Article,4,"RACE MEN PROMOTED, TO AID OUR PEOPLE IN JOB SE...","RACE MEN PROMOTED, TO AID OUR PEOPLE IN JOB SE..."
...,...,...,...,...,...,...,...,...,...
18492,158389,490673491,Atlanta Daily World,"Dec 3, 1941",19411203,Advertisement,2,Display Ad 2 -- No Title ...,Display Ad 2 -- No Title ...
18493,158413,490673534,Atlanta Daily World,"Dec 3, 1941",19411203,Article,3,Bible Thought ...,Bible Thought ...
18494,158427,490673555,Atlanta Daily World,"Dec 3, 1941",19411203,Article,4,Birmingham NAACP Meets Thursday ...,Birmingham NAACP Meets Thursday ...
18495,158447,490673591,Atlanta Daily World,"Dec 3, 1941",19411203,Image/Photograph,5,"""In The Money"" ...","""In The Money"" ..."


Reorder columns so `text` is not the final column

In [5]:
new_cols = ['RecordID','AlphaPubDate','original_text','text']
compiled = compiled[new_cols]

Get all the punctuation we want to remove

In [6]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
import unicodedata

text = compiled['text']
punc_marks = {}
for i in range(len(text)):
    for x in text[i]:
        category = unicodedata.category(x)
        if category[0] == 'P':
            punc_marks[x] = 1

all_punctuation = ''.join(punc_marks)
all_punctuation = all_punctuation + 'â' + '€' + '$' + '~' + '^'
print(all_punctuation)

.:,"-*%\?(')!;]/_{[&#}@â€$~^


Obtain stopwords dictionary and save all to a list

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))

Function to remove any non-ascii characters

In [10]:
def remove_non_ascii(string):
    return ''.join(char for char in string if ord(char) < 128)

Set all the words to lower case and remove punctuation, numbers, white spaces and non-ascii characters

In [11]:
text = compiled['text']
for i in range(len(text)):
  graf = '';
  split_text = remove_non_ascii(text[i]).split() #non-ascii
  for word in split_text:
    word = word.lower() #lower case
    word = word.translate(str.maketrans('', '', all_punctuation)) #punctuation
    word = word.strip('0123456789') #numbers
    word = word.strip() #white spaces
    graf = graf + word + ' '
  text[i] = graf

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text[i] = graf


Remove all stop words

In [12]:
for i in range(len(text)):
  graf = '';
  split_text = text[i].split()
  for word in split_text:
    if word not in en_stops:
      graf = graf + word + ' '
  text[i] = graf

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text[i] = graf


Import the `nltk` packages required for stemming. I tried `PorterStemmer`, `LancasterStemmer` and `WordNetLemmatizer` and found the last to be the most accurate.

In [13]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

Download the materials required for the packages to run. `WordNetLemmatizer` utilizes the `wordnet` dictionary.

In [14]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

Stem words using the `WordNetLemmatizer`. Only verbs are stemmed as specified by the `pos` parameter in the `lemmatize` function.

In [16]:
ps = PorterStemmer()
ls = LancasterStemmer()
wl = WordNetLemmatizer()

for i in range(len(text)):
  graf = '';
  sentence = text[i]
  words = word_tokenize(sentence)
  for word in words:
      graf = graf + wl.lemmatize(word, pos='v') + ' '
  text[i] = graf

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text[i] = graf


Trying the PorterStemmer...

In [17]:
for i in range(len(text)):
  graf = '';
  sentence = text[i]
  words = word_tokenize(sentence)
  for word in words:
      graf = graf + ps.stem(word) + ' '
  text[i] = graf

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text[i] = graf


Mount Google Drive and allow access in order to save as csv file.

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Save as a csv file.

In [20]:
compiled.to_csv(r'/content/ATL_1941_text.csv')

Import `collections` package for working with dictionaries.

In [None]:
import collections

Calculate word frequencies for each unique word in the file. Frequency increments by 1 if the word is already in the dictionary; otherwise, the word is added and given a frequency of 1.

In [None]:
unique_words = {}
for i in range(len(text)):
  graf = text[i]
  for word in text[i].split():
    if word in unique_words:
      unique_words[word] += 1
    else:
      unique_words[word] = 1

In [None]:
classify = compiled['entry']
rv_unique_words = {}
for i in range(len(text)):
  if (classify[i] == 1):
    graf = text[i]
    for word in text[i].split():
      if word in rv_unique_words:
        rv_unique_words[word] += 1
      else:
        rv_unique_words[word] = 1

Use the `collections` package to list the 50 most common words.

In [None]:
word_counter = collections.Counter(unique_words)
word_counter.most_common(50)

[('mr', 19545),
 ('j', 12921),
 ('c', 6159),
 ('church', 5534),
 ('meet', 5329),
 ('news', 4762),
 ('r', 4629),
 ('n', 4592),
 ('one', 4527),
 ('w', 4444),
 ('rev', 4429),
 ('street', 4366),
 ('miss', 4068),
 ('sunday', 3962),
 ('l', 3946),
 ('club', 3919),
 ('say', 3705),
 ('make', 3694),
 ('h', 3685),
 ('new', 3661),
 ('school', 3511),
 ('year', 3411),
 ('presid', 3343),
 ('negro', 3283),
 ('u', 3271),
 ('e', 3247),
 ('time', 3221),
 ('atlanta', 3190),
 ('state', 3143),
 ('p', 3093),
 ('b', 2997),
 ('report', 2985),
 ('hold', 2962),
 ('day', 2865),
 ('home', 2864),
 ('pm', 2765),
 ('go', 2703),
 ('night', 2699),
 ('dr', 2654),
 ('v', 2633),
 ('two', 2629),
 ('citi', 2572),
 ('last', 2537),
 ('baptist', 2408),
 ('first', 2348),
 ('man', 2347),
 ('member', 2338),
 ('give', 2325),
 ('st', 2308),
 ('nbc', 2301)]

In [None]:
(compiled['entry'] == 1).sum()

91

In [None]:
word_counter = collections.Counter(rv_unique_words)
word_counter.most_common(100)

[('j', 150),
 ('white', 144),
 ('offic', 131),
 ('say', 123),
 ('polic', 112),
 ('negro', 110),
 ('color', 103),
 ('state', 100),
 ('two', 89),
 ('man', 87),
 ('charg', 87),
 ('lynch', 86),
 ('mr', 84),
 ('n', 80),
 ('make', 78),
 ('case', 78),
 ('c', 78),
 ('report', 76),
 ('court', 72),
 ('one', 71),
 ('new', 71),
 ('mob', 65),
 ('men', 65),
 ('feder', 65),
 ('street', 63),
 ('juri', 60),
 ('citi', 59),
 ('investig', 59),
 ('take', 56),
 ('beat', 55),
 ('last', 53),
 ('r', 51),
 ('u', 50),
 ('right', 50),
 ('night', 48),
 ('find', 48),
 ('peopl', 47),
 ('counti', 46),
 ('continu', 45),
 ('arrest', 45),
 ('w', 45),
 ('death', 44),
 ('shoot', 44),
 ('boy', 43),
 ('day', 42),
 ('jail', 42),
 ('year', 42),
 ('victim', 42),
 ('woman', 41),
 ('sever', 40),
 ('indict', 40),
 ('klan', 39),
 ('three', 39),
 ('come', 39),
 ('home', 39),
 ('member', 38),
 ('time', 38),
 ('citizen', 38),
 ('nation', 38),
 ('ol', 38),
 ('atlanta', 38),
 ('attorney', 37),
 ('naacp', 37),
 ('judg', 37),
 ('trial', 

How many times does the word *black* appear?

In [None]:
unique_words.get('black')

270

In [None]:
rv_unique_words.get('black')

1

In [None]:
rv_unique_words.get('negro')

42

In [None]:
rv_unique_words.get('color')

48

How many times does the word *white* appear?

In [None]:
unique_words.get('white')

1009

In [None]:
rv_unique_words.get('white')

77

for my own use :)

In [None]:
compiled['text'].head(50)

0     weather titl weather fa colder tempera fcnr rf...
1     mr mason speaker forum tonight mr mason speake...
2     georgia death georgia death mr annik klxti cay...
3     jc smith top shaw five jc smith top shaw five ...
4     emot factor disea emot factor disea ftv rnoar ...
5     railway cross colli result death pair railway ...
6     antilynch bill sidelight antilynch bill sideli...
7     david howard hi sponsor richard durant recit d...
8     fear freez record snow coat fear freez record ...
9     henri bracken bracken oa henri bracken pass aw...
10    harlem trivia harlem trivia marguerit l martin...
11    radio program radio program waga yawn patrol o...
12    sam auburn avenu say auburn avenu say ry ip ix...
13    wish well wish hike u littl tame win give y^l ...
14    classifi ad titl classifi rent room h mitehil ...
15    self make girl sbf make l hazel ~f livfngston ...
16    articl titl earc popular fonner assist manag a...
17    belov yearold memphian dead belov year0ld 