# Preparing CSVs for Classification

In [1]:
# Necessary Imports
import pandas as pd
import os

In [2]:
# Reading the text data from the directories. Each review is a text file.
# Code is from https://www.kaggle.com/praveenkotha2/end-to-end-text-processing-for-beginners

# If statement prevents doing all this if the csvs already exist
if not (os.path.isfile('data/reviews_train.csv' and 'data/reviews_test.csv')):
    path = 'data/' 
    train_text = []
    train_label = []
    test_text = []
    test_label = []
    train_data_path_pos = os.path.join(path,'train/pos/')
    train_data_path_neg = os.path.join(path,'train/neg/')

    for data in ['train','test']:
        for label in ['pos','neg']:
            for file in sorted(os.listdir(os.path.join(path,data,label))):
                if file.endswith('.txt'):
                    with open(os.path.join(path,data,label,file)) as file_data:
                        if data=='train':
                            train_text.append(file_data.read())
                            train_label.append( 1 if label== 'pos' else 0)
                        else :
                            test_text.append(file_data.read())
                            test_label.append( 1 if label== 'pos' else 0)

    train_df = pd.DataFrame({'Review': train_text, 'Label': train_label})
    test_df = pd.DataFrame({'Review': test_text, 'Label': test_label})
    train_df = train_df.sample(frac=1).reset_index(drop=True)
    test_df = test_df.sample(frac=1).reset_index(drop=True)
    
    train_df.to_csv('reviews_train.csv')
    test_df.to_csv('reviews_test.csv')

# Else statement reads in csvs if they've been prepared
else:
    train_df = pd.read_csv("data/reviews_train.csv", index_col=0)
    test_df = pd.read_csv("data/reviews_test.csv", index_col=0)

### Cleaning

Checking for nulls and duplicates

In [3]:
print(train_df.shape)
print(test_df.shape)

(25000, 2)
(25000, 2)


In [4]:
print(train_df.info())
train_df.head()
# No nulls in train

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 0 to 24999
Data columns (total 2 columns):
Review    25000 non-null object
Label     25000 non-null int64
dtypes: int64(1), object(1)
memory usage: 585.9+ KB
None


Unnamed: 0,Review,Label
0,If I accidentally stumbled across this script ...,0
1,"This film, was one of my childhood favorites a...",1
2,this movie just goes to show that you dont nee...,1
3,This may be one of the worst movies to ever ma...,0
4,"OK I caught this film halfway through, but.oh....",0


In [5]:
print(test_df.info())
test_df.head()
# No nulls in test

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 0 to 24999
Data columns (total 2 columns):
Review    25000 non-null object
Label     25000 non-null int64
dtypes: int64(1), object(1)
memory usage: 585.9+ KB
None


Unnamed: 0,Review,Label
0,"This film is a good companion to Blair Witch, ...",0
1,Stanwyck and Morgan are perfectly cast in what...,1
2,"Coming from the ""druggie"" generation, I though...",1
3,It's a shame that quality actors like Baldwin ...,0
4,"Two years after this short, the last ""Our Gang...",0


In [6]:
# Checking for duplicates in the train set - 96
print("Number of duplicates in the train set: ", train_df.duplicated().sum())
train_dups = train_df[train_df.duplicated(["Review"])]
train_dups.head()

Number of duplicates in the train set:  96


Unnamed: 0,Review,Label
2829,'Dead Letter Office' is a low-budget film abou...,0
3156,German filmmaker Ulli Lommel has managed a tas...,0
3178,This film was so amateurish I could hardly bel...,0
3644,Bette Midler is indescribable in this concert....,1
3778,A have a female friend who is currently being ...,1


In [7]:
# Checking for duplicates in the test set - 199
print("Number of duplicates in the test set: ", test_df.duplicated().sum())
test_dups = test_df[test_df.duplicated(["Review"])]
test_dups.head()

Number of duplicates in the test set:  199


Unnamed: 0,Review,Label
1570,"Please, If you're thinking about renting this ...",0
1829,The Hand of Death aka Countdown in Kung Fu (19...,1
3333,I am insulted and angry over the idea that a s...,0
3903,"""Three"" is a seriously dumb shipwreck movie. M...",0
5028,I remember when I first heard about Jack Frost...,0


In [8]:
# Removing duplicates
train_nodups = train_df.drop_duplicates(keep='first')
print("New length of train set: ", len(train_nodups)) # 24904

test_nodups = test_df.drop_duplicates(keep='first')
print("New length of test set: ", len(test_nodups)) # 24801

New length of train set:  24904
New length of test set:  24801


In [9]:
# Checking the distributions of our target after removing duplicates
print("Train Without Duplicates: \n{} \n".format(
    train_nodups["Label"].value_counts()))
print("Test Without Duplicates: \n{}".format(
    test_nodups["Label"].value_counts()))

# They're still fairly even (train: 12472 vs 12432, test: 12440 vs. 12361)

Train Without Duplicates: 
1    12472
0    12432
Name: Label, dtype: int64 

Test Without Duplicates: 
1    12440
0    12361
Name: Label, dtype: int64


In [10]:
# Creating a df of both train and test to look for duplicates between them
total_df = train_nodups.append(test_nodups)
print("Number of shared reviews between train and test sets: ",
      total_df.duplicated().sum())

# There are 123 reviews that appear in both the train and test sets...

Number of shared reviews between train and test sets:  123


In [11]:
# Creating a list of the duplicated reviews
train_test_dups = total_df[total_df.duplicated(["Review"])]
dup_reviews = train_test_dups["Review"]

In [12]:
# Creating an empty list for dropped items to check my work
dropped_test_index = []

# Removing duplicates between train and test from the test set
# This is a sloppy way to do this, I'll bet there's a better way
for row in test_nodups.index:
    if row in dup_reviews.index:
        dropped_test_index.append(row)
        test_nodups.drop(row, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [13]:
# Sanity check
len(dropped_test_index)

123

In [15]:
# Re-checking distributions and lengths of train and test
print("New length of train set: ", len(train_nodups)) # 24904
print("Train Without Duplicates: \n{} \n".format(
    train_nodups["Label"].value_counts()))

print("New length of test set: ", len(test_nodups)) # 24678
print("Test Without Duplicates: \n{}".format(
    test_nodups["Label"].value_counts()))

# Again, still fairly even (train: 12472 vs. 12432, test: 12412 vs. 12266)

New length of train set:  24904
Train Without Duplicates: 
1    12472
0    12432
Name: Label, dtype: int64 

New length of test set:  24678
Test Without Duplicates: 
1    12412
0    12266
Name: Label, dtype: int64


In [16]:
# # Creating csvs without duplicates
# train_nodups.to_csv('data/clean_train.csv')
# test_nodups.to_csv('data/clean_test.csv')

In [17]:
# Creating a sample of each set
train_sample = train_nodups.sample(n=5000, random_state=123)
test_sample = test_nodups.sample(n=5000, random_state=123)

In [18]:
# Sanity check
print(train_sample.shape)
print(test_sample.shape)

(5000, 2)
(5000, 2)


In [19]:
# Checking the distributions of our target after sampling
print("Train Sample: \n{} \n".format(train_sample["Label"].value_counts()))
print("Test Sample: \n{}".format(test_sample["Label"].value_counts()))

# They're still fairly even (train: 2515 vs. 2485, test: 2556 vs. 2444)

Train Sample: 
0    2515
1    2485
Name: Label, dtype: int64 

Test Sample: 
0    2556
1    2444
Name: Label, dtype: int64


In [20]:
# # Creating clean sample csvs
# train_sample.to_csv('clean_train_sample.csv')
# test_sample.to_csv('clean_test_sample.csv')