# Assignment 7 - Text Processing (Group)
*Daniel Lu, Wanyu Guan, Markus Shriner*

In [127]:
# Import libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import statsmodels.formula as smf
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## 1. Explore the data


### 1.1 Load the data. You may drop size, lines, and pagenr.

In [110]:
# Read the tab-separated corpus; pandas infers the bz2 compression from the
# file extension.
tx = pd.read_csv("./texts.csv.bz2", delimiter="\t")
# The assignment allows discarding these bookkeeping columns.
tx.drop(["size", "lines", "pagenr"], axis="columns", inplace=True)
# Preview the first five rows.
tx.head()

Unnamed: 0,name,text
0,balbulus-early-life-charlemagne,\nTitle: Early Lives of Charlemagne by Eginhar...
1,balbulus-early-life-charlemagne,"\n\nThe notes, keyed to line numbers in the so..."
2,balbulus-early-life-charlemagne,\n From a bronze statuette in the Musé...
3,balbulus-early-life-charlemagne,\n _A lui finit la dissolution ...
4,balbulus-early-life-charlemagne,public opinion in regard to the meaning of fal...


### 1.2 Ensure that there are no missing names and no empty texts in your data.

In [111]:
# Number of missing values in each column, before any cleaning.
tx.isnull().sum()

name    0
text    1
dtype: int64

In [112]:
# Drop every row that has a missing value in any column.
tx.dropna(inplace=True)
# Section 1.2 also asks for no *empty* text: remove rows whose text is the
# empty string or consists only of whitespace. Non-string entries compare
# unequal to "" and are therefore left alone.
blank_text = tx["text"].str.strip().eq("")
tx.drop(index=tx.index[blank_text], inplace=True)

In [113]:
# Re-check the per-column missing-value counts after cleaning; all should be 0.
tx.isnull().sum()

name    0
text    0
dtype: int64

### 1.3 Create a summary table that shows how many chunks of each book you have in the data. Order this by size.


In [114]:
# Show the raw text of the first chunk (positional row 0).
# NOTE(review): this cell only previews one chunk; the per-book chunk summary
# requested in section 1.3 is not computed here.
tx["text"].iloc[0]

'\nTitle: Early Lives of Charlemagne by Eginhard and the Monk of St Gall\n       edited by Prof. A. J. Grant\nAuthor: Einhard and Notker Balbulus\nRelease Date: May 03, 2015 [EBook #48870]\nLanguage: English\nCharacter set encoding: UTF-8\n\n\n*** START OF THIS PROJECT GUTENBERG EBOOK EARLY LIVES OF CHARLEMAGNE ***\n\n\n\n\nProduced by Andrew Dunning.\n\nCreated from scans by Robarts Library, University of Toronto, available\nthrough the Internet Archive.\n\n\n\n\nTranscriber’s Note'

## 2. First Task: Tokenize

### 2.1 Convert all texts to lower case


In [115]:
# Lower-case every text chunk, overwriting the column in place.
tx["text"] = tx["text"].str.lower()
# Display the result.
tx["text"]

0        \ntitle: early lives of charlemagne by eginhar...
1        \n\nthe notes, keyed to line numbers in the so...
2        \n         from a bronze statuette in the musé...
3        \n                _a lui finit la dissolution ...
4        public opinion in regard to the meaning of fal...
                               ...                        
12919         descriptive cataloging division lm 540\n ...
12920         james graber\n     information technology...
12921    \n     john w. kimball, jr\n     machine-reada...
12922         (202) 707-7706\n\n     chandru j. shahani...
12923         preservation microfilming office lm g05\n...
Name: text, Length: 12923, dtype: object

### 2.2 Remove punctuation and other weird characters. I recommend replacing these with spaces.

In [116]:
# ASCII punctuation characters that will be replaced by spaces.
# The backslash is spelled "\\" explicitly: a bare "\]" is not a valid escape
# sequence and triggers a DeprecationWarning/SyntaxWarning on current Python
# versions. The resulting string value is unchanged.
punk = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
punk

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [117]:
# Replace each punctuation character listed in `punk` with a single space, so
# that words joined by punctuation are separated rather than glued together.
tx["text"] = tx["text"].str.translate(
    str.maketrans(dict.fromkeys(punk, " "))
)

In [118]:
# Inspect the texts after punctuation has been replaced.
tx["text"]

0        \ntitle  early lives of charlemagne by eginhar...
1        \n\nthe notes  keyed to line numbers in the so...
2        \n         from a bronze statuette in the musé...
3        \n                 a lui finit la dissolution ...
4        public opinion in regard to the meaning of fal...
                               ...                        
12919         descriptive cataloging division lm 540\n ...
12920         james graber\n     information technology...
12921    \n     john w  kimball  jr\n     machine reada...
12922          202  707 7706\n\n     chandru j  shahani...
12923         preservation microfilming office lm g05\n...
Name: text, Length: 12923, dtype: object

### 2.3 Tokenize texts to words. If you replaced punctuation with spaces, you can just use pandas' str.split method.


In [125]:
# Split every text on runs of whitespace into a list of word tokens.
# Entries that are not strings (e.g. rows that were already tokenized by an
# earlier run of this cell) are passed through unchanged; a plain
# `.str.split()` would turn them into NaN and silently wipe the column when
# the cell is executed a second time.
tx.text = tx.text.map(
    lambda chunk: chunk.split() if isinstance(chunk, str) else chunk
)

In [126]:
# Inspect the tokenized texts (one list of words per chunk).
tx["text"]

0        [title, early, lives, of, charlemagne, by, egi...
1        [the, notes, keyed, to, line, numbers, in, the...
2        [from, a, bronze, statuette, in, the, musée, c...
3        [a, lui, finit, la, dissolution, de, l’ancien,...
4        [public, opinion, in, regard, to, the, meaning...
                               ...                        
12919    [descriptive, cataloging, division, lm, 540, 2...
12920    [james, graber, information, technology, servi...
12921    [john, w, kimball, jr, machine, readable, coll...
12922    [202, 707, 7706, chandru, j, shahani, preserva...
12923    [preservation, microfilming, office, lm, g05, ...
Name: text, Length: 12923, dtype: object

### 2.4 Remove stopwords. It is up to you to decide which stopwords to remove; I recommend including at least "the" and "a".
