#### This notebook does the following:
- Read the tsv files in pandas dataframe
- Replace every '- ' (a hyphen followed by a space) in the text with '' so that the two word fragments around it are joined
- Store the dataframes in a dictionary whose keys are tuple ('year', 'act_no')
- Segment each act into sentences
- Create and concatenate the dataframe of each act by year into a big dataframe


In [2]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize

#### Read the tsv file and store in a dataframe

In [11]:
# Load the tab-separated file of acts into a DataFrame.
# The pure-Python parsing engine is requested explicitly.
df = pd.read_csv(
    "PauliMurrayLabels_part2.tsv",
    sep="\t",
    engine="python",
)
# Leave the frame as the last expression so the notebook renders it.
df

Unnamed: 0,Year,Act_No,Act
0,1908,443,AN ACT to Amenp SECTION 142 oF CRIMINAL CODE O...
1,1909,132,"AN ACT to REPEAL SECTION 1295, VoLUME I, Cope ..."
2,1910,373,AN ACT Makxinc it A MISDEMEANOR FOR ANY PERSON...
3,1910,411,AN ACT to AMEND AN Act ENTITLED An Act To EsT...
4,1911,110,AN ACT To PRovwe FoR WorKING ALL ABLE-Boptep M...
5,1912,298,AN ACT tro AMEND AN Act EnxtitLtep An Act to E...
6,1912,320,AN ACT to AMEND Section 2158 oF tHE CopE or La...
7,1912,312,AN ACT to AMEND AN Act EntitLep AN Act TO PRov...
8,1914,291,"AN ACT to Amend Section 104, Volume II, Code o..."
9,1914,392,AN ACT to Provide for the Transfer of Insane I...


In [12]:
# Strip every '- ' (hyphen followed by a space) from the text so that the
# two word fragments on either side of it are joined together.
# The pattern is applied as a regular expression across the whole frame and
# the result is a new frame; `df` itself is left untouched.
df_new = df.replace(regex=r"- ", value="")

In [13]:
# Build a lookup of acts keyed by the tuple (Year, Act_No).
# Each value is the list of that row's remaining column values, i.e. every
# column other than Year and Act_No.
l1 = zip(df_new.Year, df_new.Act_No)
remaining_values = df_new.set_index(['Year', 'Act_No']).values.tolist()
# Note: `l1` is a one-shot iterator and is exhausted once the dict is built.
act_dict = {act_key: row_values for act_key, row_values in zip(l1, remaining_values)}

In [14]:
# Display the (Year, Act_No) tuples that key the dictionary of acts
act_dict.keys()

dict_keys([(1908, 443), (1909, 132), (1910, 373), (1910, 411), (1911, 110), (1912, 298), (1912, 320), (1912, 312), (1914, 291), (1914, 392), (1915, 69), (1915, 107), (1916, 508), (1916, 509), (1916, 391), (1917, 20), (1917, 189), (1918, 517), (1918, 501), (1918, 398), (1920, 345), (1920, 541), (1920, 561), (1924, 537), (1924, 550), (1926, 932), (1928, 656), (1930, 1204), (1934, 893)])

#### The following operations take place in the code below:
- tokenize each act stored in a dictionary and store it in a list
- convert the above list into dataframe
- append each dataframe into a big list (appended data)

In [15]:
# Holds one DataFrame of tokenized sentences per act; the frames are
# concatenated into a single DataFrame in a later cell.
appended_data = []

# Split each act stored in the dictionary into sentences.
for (year, act_no), val in act_dict.items():
    # The dictionary value is a list of the row's non-key columns; its first
    # entry is used as the text of the act.
    act_text = val[0]

    # Passing the text to the constructor trains this Punkt tokenizer on the
    # act itself; the trained tokenizer then splits that same text.
    sent_tokenizer = PunktSentenceTokenizer(act_text)

    # Lower-case every sentence while collecting them.
    sentences = [sentence.lower() for sentence in sent_tokenizer.tokenize(act_text)]

    # Name the columns when the frame is built instead of patching
    # `columns.values` afterwards: writing into the array behind a pandas
    # Index bypasses the Index API and is not guaranteed to be seen by label
    # lookups. Building it this way also copes with an act that yields no
    # sentences (the result is an empty frame with the same three columns).
    # Column order is Year, Act_No, Act; the scalar year and act number are
    # broadcast to every sentence row.
    sent_df = pd.DataFrame({'Year': year, 'Act_No': act_no, 'Act': sentences})

    # Add this act's frame of sentences to the list.
    appended_data.append(sent_df)

  
   


In [16]:
# Stack the per-act sentence frames on top of each other (row-wise) to form
# one big DataFrame. Row labels are kept as they are in each per-act frame,
# so the same label can appear once per act.
# Note: `appended_data` is rebound here from a list of frames to a DataFrame.
appended_data = pd.concat(objs=appended_data, axis=0)
appended_data

Unnamed: 0,Year,Act_No,Act
0,1908,443,an act to amenp section 142 of criminal code o...
1,1908,443,section 1.
2,1908,443,be it enacted by the general assembly of the s...
3,1908,443,in the case of any prisoner lawfully in the ch...
4,1908,443,shall be the duty of the prosecuting attorney ...
...,...,...,...
7,1934,893,section 3.
8,1934,893,all acts or parts of acts inconsistent herewit...
9,1934,893,section 4.
10,1934,893,this act shall take effect immediately upon it...


#### Keep only the rows whose `Act` text is longer than 10 characters (to discard rows that contain only short entries such as "section x." or "sec. x.")

In [17]:

# Boolean mask that is True for sentences with more than 10 characters.
is_long_enough = appended_data['Act'].str.len().gt(10)
# Keep only the rows selected by the mask.
appended_data = appended_data.loc[is_long_enough]


#### Save the sentence-segmented acts to a CSV file

In [18]:
# Write the filtered sentences to disk. The row labels are written as the
# first column of the file (pandas' default, spelled out here).
appended_data.to_csv("PauliMurraySentences_part2.csv", index=True)