# Process 20newsgroups into csv

In [1]:
import pandas
import os

### Create empty dataframe

In [2]:
dataframe = pandas.DataFrame(columns = ["text", "category", "full_path"])

In [3]:
dataframe

Unnamed: 0,text,category,full_path


### Process all files in directory, adding contents to dataframe

In [4]:
partial_path = "/users/danielcorcoran/desktop/github_repos/python_nb_sklearn_gaussian_naive_bayes/20_newsgroups/"

In [5]:
index_counter = 0

for folder in os.listdir(partial_path):
    if folder != ".DS_Store":
        
        #process on the folder level
        folder_path = partial_path + folder + "/"
        
        print("Processing:",folder_path)
        
        for filename in os.listdir(folder_path):
            
            full_path = folder_path + filename
            
            with open(full_path, "r", errors = "ignore") as my_file:
                
                data = my_file.read()
                
            dataframe.loc[index_counter, "text"] = data
            dataframe.loc[index_counter, "category"] = folder
            dataframe.loc[index_counter, "full_path"] = full_path
            
            my_file.close()
                
            index_counter += 1
        

Processing: /users/danielcorcoran/desktop/github_repos/python_nb_sklearn_gaussian_naive_bayes/20_newsgroups/talk.politics.mideast/
Processing: /users/danielcorcoran/desktop/github_repos/python_nb_sklearn_gaussian_naive_bayes/20_newsgroups/rec.autos/
Processing: /users/danielcorcoran/desktop/github_repos/python_nb_sklearn_gaussian_naive_bayes/20_newsgroups/comp.sys.mac.hardware/
Processing: /users/danielcorcoran/desktop/github_repos/python_nb_sklearn_gaussian_naive_bayes/20_newsgroups/alt.atheism/
Processing: /users/danielcorcoran/desktop/github_repos/python_nb_sklearn_gaussian_naive_bayes/20_newsgroups/rec.sport.baseball/
Processing: /users/danielcorcoran/desktop/github_repos/python_nb_sklearn_gaussian_naive_bayes/20_newsgroups/comp.os.ms-windows.misc/
Processing: /users/danielcorcoran/desktop/github_repos/python_nb_sklearn_gaussian_naive_bayes/20_newsgroups/rec.sport.hockey/
Processing: /users/danielcorcoran/desktop/github_repos/python_nb_sklearn_gaussian_naive_bayes/20_newsgroups/sci

In [6]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19997 entries, 0 to 19996
Data columns (total 3 columns):
text         19997 non-null object
category     19997 non-null object
full_path    19997 non-null object
dtypes: object(3)
memory usage: 1.2+ MB


### Validate

In [7]:
dataframe["full_path"].nunique()

19997

In [8]:
dataframe["category"].nunique()

20

In [9]:
dataframe["text"].nunique()

19466

In [11]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19997 entries, 0 to 19996
Data columns (total 3 columns):
text         19997 non-null object
category     19997 non-null object
full_path    19997 non-null object
dtypes: object(3)
memory usage: 1.2+ MB


### Clean strings

In [15]:
def replace_new_line(string):
    cleaned = string.replace("\n","")
    return cleaned

In [16]:
dataframe["text_cleaned"] = dataframe["text"].apply(replace_new_line)

In [17]:
dataframe

Unnamed: 0,text,category,full_path,text_cleaned
0,Newsgroups: talk.politics.mideast\nPath: canta...,talk.politics.mideast,/users/danielcorcoran/desktop/github_repos/pyt...,Newsgroups: talk.politics.mideastPath: cantalo...
1,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....,talk.politics.mideast,/users/danielcorcoran/desktop/github_repos/pyt...,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....
2,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....,talk.politics.mideast,/users/danielcorcoran/desktop/github_repos/pyt...,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....
3,Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....,talk.politics.mideast,/users/danielcorcoran/desktop/github_repos/pyt...,Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....
4,Xref: cantaloupe.srv.cs.cmu.edu soc.culture.ar...,talk.politics.mideast,/users/danielcorcoran/desktop/github_repos/pyt...,Xref: cantaloupe.srv.cs.cmu.edu soc.culture.ar...
5,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,talk.politics.mideast,/users/danielcorcoran/desktop/github_repos/pyt...,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...
6,Xref: cantaloupe.srv.cs.cmu.edu soc.culture.so...,talk.politics.mideast,/users/danielcorcoran/desktop/github_repos/pyt...,Xref: cantaloupe.srv.cs.cmu.edu soc.culture.so...
7,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....,talk.politics.mideast,/users/danielcorcoran/desktop/github_repos/pyt...,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....
8,Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....,talk.politics.mideast,/users/danielcorcoran/desktop/github_repos/pyt...,Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....
9,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....,talk.politics.mideast,/users/danielcorcoran/desktop/github_repos/pyt...,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....


### Export


In [18]:
export_path = "/users/danielcorcoran/desktop/github_repos/python_nb_sklearn_gaussian_naive_bayes/20_newsgroups_processed/20_newsgroups.csv"

dataframe.to_csv(export_path, index_label= "row_index")