# Assignment 7 - Text Processing (Group)
*Daniel Lu, Wanyu Guan, Markus Shriner*

In [1]:
# Import libraries
import re
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import statsmodels.formula as smf
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

## 1. Explore the data


### 1.1 Load the data. You may drop size, lines, and pagenr.

In [2]:
#Load the data 
tx = pd.read_csv("./texts.csv.bz2", sep="\t")
tx.drop(columns=["size", "lines", "pagenr"], inplace=True)
tx.head(5)

Unnamed: 0,name,text
0,balbulus-early-life-charlemagne,\nTitle: Early Lives of Charlemagne by Eginhar...
1,balbulus-early-life-charlemagne,"\n\nThe notes, keyed to line numbers in the so..."
2,balbulus-early-life-charlemagne,\n From a bronze statuette in the Musé...
3,balbulus-early-life-charlemagne,\n _A lui finit la dissolution ...
4,balbulus-early-life-charlemagne,public opinion in regard to the meaning of fal...


### 1.2 Ensure that you don't have any missing name, and empty text in your data.

In [3]:
tx.isna().sum()

name    0
text    1
dtype: int64

In [4]:
tx.dropna(inplace=True)

In [5]:
tx.isna().sum()

name    0
text    0
dtype: int64

### 1.3 Create a summary table where you show how many chunks of each book you have in data. Order this by size.


In [6]:
tx.groupby('name').count().sort_values(by='text', ascending=False)

Unnamed: 0_level_0,text
name,Unnamed: 1_level_1
cia-world-factbook-1992,2822
bible,1321
webster-early-european-history,1265
vaneeden-quest,864
hardy-madding-crowd,723
why-speech-output,680
selected-polish-tales,534
unamuno-tragic-sense-of-life,519
naval-academy-sound-military-decision,485
milton-paradise-lost,466


### 1.4 Explore the data: check out a few pages from various titles, as a minimum take a look how do a few books and a few CS papers look like

In [7]:
tx[tx['name'] == 'bible'][:2]

Unnamed: 0,name,text
477,bible,In the beginning God created the heaven and th...
478,bible,And God made the beast of the earth after his ...


In [8]:
tx[tx.name.str.contains('paper')][:2]

Unnamed: 0,name,text
8380,paper-compact-hash-tables,".EQ\ndelim $$\ndefine <- ?< ""\h'-0.5m'"" up 10 ..."
8381,paper-compact-hash-tables,"define elseif '""\fBelseif\fI""~' \ndefine for '..."


## 2. First Task: Tokenize

### 2.1 Convert all texts to lower case


In [9]:
tx.text = tx.text.str.lower()
tx.text

0        \ntitle: early lives of charlemagne by eginhar...
1        \n\nthe notes, keyed to line numbers in the so...
2        \n         from a bronze statuette in the musé...
3        \n                _a lui finit la dissolution ...
4        public opinion in regard to the meaning of fal...
                               ...                        
12919         descriptive cataloging division lm 540\n ...
12920         james graber\n     information technology...
12921    \n     john w. kimball, jr\n     machine-reada...
12922         (202) 707-7706\n\n     chandru j. shahani...
12923         preservation microfilming office lm g05\n...
Name: text, Length: 12923, dtype: object

### 2.2 Remove punctuation and other weird characters. I recommend to replace these with space.

In [10]:
# tx.text = tx.text.map(lambda x: re.sub(r'\W+', ' ', x))
tx.text = tx.text.map(lambda x: re.sub('[^a-z0-9]+', ' ', x))
# tx.text.map(lambda x: print('$$$' + x))

In [11]:
# punk = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
# punk

In [12]:
# tx.text = tx.text.str.translate(str.maketrans(punk, ' '*len(punk))) 

In [13]:
tx.text

0         title early lives of charlemagne by eginhard ...
1         the notes keyed to line numbers in the source...
2         from a bronze statuette in the mus e carnaval...
3         a lui finit la dissolution de l ancien monde ...
4        public opinion in regard to the meaning of fal...
                               ...                        
12919     descriptive cataloging division lm 540 202 70...
12920     james graber information technology services ...
12921     john w kimball jr machine readable collection...
12922     202 707 7706 chandru j shahani preservation r...
12923     preservation microfilming office lm g05 202 7...
Name: text, Length: 12923, dtype: object

### 2.3 Tokenize texts to words. If you replaced punctuation with spaces, you can just use pandas' str.split method.


In [14]:
tx.text = tx.text.str.split()

In [15]:
tx.text

0        [title, early, lives, of, charlemagne, by, egi...
1        [the, notes, keyed, to, line, numbers, in, the...
2        [from, a, bronze, statuette, in, the, mus, e, ...
3        [a, lui, finit, la, dissolution, de, l, ancien...
4        [public, opinion, in, regard, to, the, meaning...
                               ...                        
12919    [descriptive, cataloging, division, lm, 540, 2...
12920    [james, graber, information, technology, servi...
12921    [john, w, kimball, jr, machine, readable, coll...
12922    [202, 707, 7706, chandru, j, shahani, preserva...
12923    [preservation, microfilming, office, lm, g05, ...
Name: text, Length: 12923, dtype: object

### 2.4 Remove stopwords. It is up to you to decide which stopwords to remove, I recommend to include at least the and a.


In [16]:
stop_words = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once',
                  'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for',
                  'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is',
                  's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until',
                  'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were',
                  'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above',
                  'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before',
                  'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then',
                  'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he',
                  'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i',
                  'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing',
                  'it', 'how', 'further', 'was', 'here', 'than', '\x1a']

tx.text = tx['text'].apply(lambda x: [token for token in x if token not in stop_words])
tx.head(5)

Unnamed: 0,name,text
0,balbulus-early-life-charlemagne,"[title, early, lives, charlemagne, eginhard, m..."
1,balbulus-early-life-charlemagne,"[notes, keyed, line, numbers, source, edition,..."
2,balbulus-early-life-charlemagne,"[bronze, statuette, mus, e, carnavalet, paris,..."
3,balbulus-early-life-charlemagne,"[lui, finit, la, dissolution, de, l, ancien, m..."
4,balbulus-early-life-charlemagne,"[public, opinion, regard, meaning, falsehood, ..."


### 2.5 Create such vocabulary and order it alphabetically

In [17]:
vocab = set()
for i in range(len(tx.text)):
    vocab|=set(tx.text.iat[i])
# vocab |= set(tx.text.iat[i] for i in range(len(tx.text)))
vocab = sorted(vocab)

In [18]:
vocab[:10]

['0',
 '00',
 '000',
 '0000',
 '00000',
 '00000000000test',
 '00006',
 '0001',
 '0002',
 '00021']

## 3. Implement BOW

### Using CountVectorizer


In [19]:
tx.text = tx.text.apply(lambda x: ' '.join(map(str, x)))

In [20]:
tx.text[:5]

0    title early lives charlemagne eginhard monk st...
1    notes keyed line numbers source edition conver...
2    bronze statuette mus e carnavalet paris early ...
3    lui finit la dissolution de l ancien monde lui...
4    public opinion regard meaning falsehood obliga...
Name: text, dtype: object

In [21]:
vrizer = CountVectorizer()
r = vrizer.fit(tx.text) 
X = vrizer.transform(tx.text)
pd.DataFrame(X.toarray(), columns= vrizer.get_feature_names())

Unnamed: 0,00,000,0000,00000,00000000000test,00006,0001,0002,00021,00021053,...,zurbuchen,zurich,zuriel,zurishaddai,zuta,zuzims,zviad,zwingli,zx,zzassgl
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12919,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12920,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 4. Model