In [1]:
# Import all the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
#================================================================================================
# USER INPUT!
# List the CSV files that need reading in. Each entry is loaded into its own dataframe, and the
# per-dataframe settings lists further down the notebook are indexed in this same order.
datafilenames = ["train.csv"] # In this case there is only one file to read in
#================================================================================================

In [3]:
# Load every configured CSV file into its own dataframe, in the order given by datafilenames
alldataframes = list(map(pd.read_csv, datafilenames))

In [13]:
# Here are a bunch of functions that are aimed at detecting dirty data and cleaning it

def ratiosOfDifferentTypes(column, samplesize=1000, random_state=None):
    """Tally the Python types (int, float, str, ...) of the distinct entries of a column.

    Missing values are dropped before tallying. NaN is itself a float, so leaving it in
    made every string column with a gap look like a mixed str/float column.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.
    samplesize : int, optional
        Maximum number of distinct values to inspect. A column with more distinct values
        than this is randomly sub-sampled down to this many.
    random_state : optional
        Forwarded to ``Series.sample``; pass a seed to make the sub-sampling repeatable.

    Returns
    -------
    tuple
        ``(differenttypes, frequencies)``: two parallel lists holding each type found and
        the fraction of the inspected values that have that type. Both lists are empty
        when the column has no non-missing entries.
    """
    uniquerows = column.dropna().drop_duplicates()
    if uniquerows.size == 0:
        # Nothing to tally; returning early also avoids dividing by a zero total below
        return ([], [])
    if uniquerows.size > samplesize:
        # Only sample when it actually shrinks the data; small columns are tallied in full
        uniquerows = uniquerows.sample(n=samplesize, random_state=random_state)

    # Count in a single pass, keeping the types in order of first appearance
    differenttypes = []
    tally = {}
    for entry in uniquerows:
        entrytype = type(entry)
        if entrytype not in tally:
            differenttypes.append(entrytype)
            tally[entrytype] = 0
        tally[entrytype] += 1

    # float() keeps the division a true division on Python 2 as well
    total = float(sum(tally.values()))
    frequencies = [tally[giventype] / total for giventype in differenttypes]
    return (differenttypes, frequencies)

def decideType(column, threshold=0.8):
    """Decide which type the entries of a column are meant to have.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect; it is tallied with ratiosOfDifferentTypes.
    threshold : float, optional
        Minimum share of the tallied values the most frequent type must reach to be
        accepted as the intended type.

    Returns
    -------
    type
        The most frequent type when its share is at least ``threshold``. ``object`` when
        the entries are too mixed to decide, or when there was nothing to tally.
    """
    differenttypes, frequencies = ratiosOfDifferentTypes(column)
    if not frequencies:
        # No entries were tallied; max() would raise ValueError on the empty list,
        # so report "cannot decide" instead
        return object
    highestfrequency = max(frequencies)
    if highestfrequency >= threshold:
        # all the rows should probably be of the same type and some have been inputted incorrectly
        return differenttypes[frequencies.index(highestfrequency)]
    # the rows have a very mixed type and it's not very clear what the correct type is
    return object

def findMixedTypes(dataframe):
    """Return the names of the dirty columns, i.e. those whose entries have more than one type."""
    mixedcolumns = []
    for colname in dataframe:
        frequencies = ratiosOfDifferentTypes(dataframe[colname])[1]
        if len(frequencies) > 1:
            mixedcolumns.append(colname)
    return mixedcolumns

def analyzeColumnTypes(dataframe):
    """Print whether each column is clean or dirty, with a guess at the type of dirty ones.

    A column is dirty when findMixedTypes reports it as having mixed types of entries.
    For every dirty column the type chosen by decideType is printed, or a note that the
    column is too mixed to tell when decideType returns ``object``.

    The messages use the single-argument ``print(...)`` form, which produces the same
    output on Python 2 and Python 3.

    Returns
    -------
    list
        The names of the columns with mixed types (empty when all columns are clean).
    """
    mixedtypecolumns = findMixedTypes(dataframe)
    if mixedtypecolumns==[]:
        print("All columns have a single type; they are 'clean'. (They may be incorrect though, or have NaNs).")
    else:
        print("The columns %s have mixed types:\n" % (mixedtypecolumns,))
        for colname in mixedtypecolumns:
            correcttype = decideType(dataframe[colname])
            # str() so that non-string column labels can be concatenated as well
            if correcttype is object:
                print(" - \'" + str(colname) + "\'" + " is so mixed it's hard to tell the right type")
            else:
                print(" - \'" + str(colname) + "\'" + " should be " + "\'" + correcttype.__name__ + "\'")
        print("\nAll other columns have a single type; they are 'clean'. (They may be incorrect though, or have NaNs).")
    return mixedtypecolumns

def findCleanStringTypes(dataframe):
    """Return the names of the columns for which the only type tallied is str."""
    stringcolumns = []
    for colname in dataframe:
        typesfound = ratiosOfDifferentTypes(dataframe[colname])[0]
        if typesfound == [str]:
            stringcolumns.append(colname)
    return stringcolumns

def outlineNaNs(dataframe):
    """Print, for every column that contains NaNs, the percentage of its rows that are NaN.

    The percentage is the number of NaNs in the column divided by the number of rows of
    the dataframe, scaled to 0-100 so that it agrees with the "%" sign printed after it.
    Columns without NaNs are not listed. Nothing is returned.
    """
    totalnumberofNaNs = pd.isnull(dataframe).sum()
    columnswithNaNs = totalnumberofNaNs[totalnumberofNaNs > 0]
    if len(columnswithNaNs)>0:
        percentageofNaNs = 100. * columnswithNaNs.astype(np.float64) / dataframe.shape[0]
        print("Here are the columns with NaNs:")
        # Walk labels and values side by side instead of indexing the label-indexed
        # Series by position
        for colname, percentage in zip(percentageofNaNs.index, percentageofNaNs.values):
            print(" - \'" + str(colname) + "\' has percentage of NaNs: \t" + str(percentage) + " %")
        print("\nNo other columns have NaNs.")
    else:
        print("No columns have NaNs.")


In [14]:
# Now we're going to print out which columns have dirty entries, i.e. have mixed types,
# and we'll try and guess what those entries should be. An overview of the NaNs follows.
# mixedtypesindataframes[ii] ends up holding the dirty column names of alldataframes[ii].

mixedtypesindataframes = []
for ii in range(len(alldataframes)):
    print("ANALYZING DATAFRAME FROM " + datafilenames[ii] + ":")
    print("====================================================")
    # Collect into a real list: a Python 3 range object does not support item assignment
    mixedtypesindataframes.append(analyzeColumnTypes(alldataframes[ii]))
    print("-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -")
    outlineNaNs(alldataframes[ii])
    print("====================================================")

ANALYZING DATAFRAME FROM train.csv:
All columns have a single type; they are 'clean'. (They may be incorrect though, or have NaNs).
-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
No columns have NaNs.


In [15]:
# Let's look at the UNIQUE values.
# We'll go through the dirty entries, as well as the purely-string entries.

for ii in range(len(alldataframes)):
    print("ANALYZING DATAFRAME FROM " + datafilenames[ii] + ":")
    print("----------------------------------------------------")
    cleanstringcolumns = findCleanStringTypes(alldataframes[ii])
    for col in mixedtypesindataframes[ii] + cleanstringcolumns:
        print("Column '" + str(col) + "' has the following unique entries:\n")
        uniqueentries = alldataframes[ii][col].unique()
        try:
            uniqueentries = np.sort(uniqueentries)
        except TypeError:
            # Entries of different types cannot be ordered against each other on Python 3;
            # fall back to ordering them by their text form
            uniqueentries = np.array(sorted(uniqueentries, key=str), dtype=object)
        print(uniqueentries)
        # An explicit empty string prints a blank line on both Python 2 and Python 3
        print("")

ANALYZING DATAFRAME FROM train.csv:
----------------------------------------------------
Column 'Name' has the following unique entries:

['Abbing, Mr. Anthony' 'Abbott, Mr. Rossmore Edward'
 'Abbott, Mrs. Stanton (Rosa Hunt)' 'Abelson, Mr. Samuel'
 'Abelson, Mrs. Samuel (Hannah Wizosky)' 'Adahl, Mr. Mauritz Nils Martin'
 'Adams, Mr. John' 'Ahlin, Mrs. Johan (Johanna Persdotter Larsson)'
 'Aks, Mrs. Sam (Leah Rosen)' 'Albimona, Mr. Nassef Cassem'
 'Alexander, Mr. William' 'Alhomaki, Mr. Ilmari Rudolf' 'Ali, Mr. Ahmed'
 'Ali, Mr. William' 'Allen, Miss. Elisabeth Walton'
 'Allen, Mr. William Henry' 'Allison, Master. Hudson Trevor'
 'Allison, Miss. Helen Loraine'
 'Allison, Mrs. Hudson J C (Bessie Waldo Daniels)' 'Allum, Mr. Owen George'
 'Andersen-Jensen, Miss. Carla Christine Nielsine' 'Anderson, Mr. Harry'
 'Andersson, Master. Sigvard Harald Elias'
 'Andersson, Miss. Ebba Iris Alfrida' 'Andersson, Miss. Ellis Anna Maria'
 'Andersson, Miss. Erna Alexandra' 'Andersson, Miss. Ingeborg Con

In [7]:
#================================================================================================
# USER INPUT!
# After having run the previous cells you know which columns are clean and which are dirty.
# Look at the unique values of the dirty ones, given above, and use that information to fill in
# the settings below. Every setting is a list with one entry per dataframe, in the same order
# as alldataframes.

# For each dataframe, the dirty columns that should be numeric
columnsthatshouldbenumeric = [
    ["Survived","Pclass"]
]
# In each column there will be some conventions on how the bad entries are written out.
# For each column that should be numeric (same order as in columnsthatshouldbenumeric) we specify
# a tuple (decimaldelimiter (a string), thousanddelimeter (a string), listofstringstoremove (a list)),
# which is passed on to turnToNumeric.
structureofeachcolumn = [
    [
        (".",",",["-"," ","%"]),
        (".",",",["-"," "])
    ]
]

# For each dataframe, the columns that should be strings
columnsthatshouldbestrings = [
    ["Name","Sex"]
]

# For each dataframe, the columns that should be datetimes (none in this case)
columnsthatshouldbedatetimes = [
    []
]
#================================================================================================

# FROM HERE ON IT'S AUTOMATIC

# This is a test column to see if it all works. CAN BE REMOVED.
# messedupcolumn = pd.Series([1,"200",3,4,5.99,11,"11 , 000 .203","11-2%22"])

def turnToNumeric(column,decimaldelimiter=".",thousanddelimeter=",",listofstringstoremove=["-"," ","%"]):
    """Turn a column that should be numeric, but is dirty with badly made strings, into numbers.

    Every entry is converted to text, the thousand delimiter and all the strings in
    ``listofstringstoremove`` are deleted, the decimal delimiter is replaced with a period,
    and the result is parsed with ``pd.to_numeric``.

    All delimiters and strings are matched literally. They used to be spliced unescaped
    into a regular expression, so that e.g. a thousand delimiter of "." matched (and
    deleted) every character.

    Parameters
    ----------
    column : pandas.Series
        The column to convert.
    decimaldelimiter : str
        The string that separates the integer part from the decimals.
    thousanddelimeter : str
        The string that groups the thousands; it is removed.
    listofstringstoremove : list of str
        Additional strings to remove. Note that removing "-" (part of the default)
        also strips minus signs.

    Returns
    -------
    pandas.Series
        The numeric column, as returned by ``pd.to_numeric``.
    """
    import re  # local import: the notebook's import cell does not provide it

    # Longest first, so that a string is never pre-empted by a shorter one it starts with
    pieces = [piece for piece in list(listofstringstoremove) + [thousanddelimeter] if piece]
    pieces.sort(key=len, reverse=True)
    toremoveregex = re.compile("|".join(re.escape(piece) for piece in pieces)) if pieces else None

    def cleanentry(text):
        if toremoveregex is not None:
            text = toremoveregex.sub("", text)
        if decimaldelimiter and decimaldelimiter != ".":
            text = text.replace(decimaldelimiter, ".")
        return text

    # na_action="ignore" leaves entries that are still missing after astype(str) untouched
    return pd.to_numeric(column.astype(str).map(cleanentry, na_action="ignore"))

def turnToString(column):
    """Turn a dirty column that should all be strings into one whose entries are strings.

    Missing entries stay missing. ``astype(str)`` alone would turn NaN into the literal
    text "nan", which dropna/fillna no longer recognise as a missing value.
    """
    return column.astype(str).where(column.notnull())

def turnToDate(column):
    """Parse a column into datetimes with pd.to_datetime, reading dates day-first."""
    asdatetimes = pd.to_datetime(column, dayfirst=True)
    return asdatetimes

# Apply the conversions configured above to every dataframe.
# The columns are replaced with plain column assignment rather than .loc[:, col] = ...,
# because recent pandas versions write through .loc into the existing column and try to
# keep its old dtype, which would undo the conversion.
for ii in range(len(alldataframes)):
    for coltofix in columnsthatshouldbestrings[ii]:
        alldataframes[ii][coltofix] = turnToString(alldataframes[ii][coltofix])

    for coltofix in columnsthatshouldbedatetimes[ii]:
        alldataframes[ii][coltofix] = turnToDate(alldataframes[ii][coltofix])

    for (jj,coltofix) in enumerate(columnsthatshouldbenumeric[ii]):
        # structureofeachcolumn[ii][jj] describes how column jj of dataframe ii is written out
        (decimaldelimiter,thousanddelimeter,listofstringstoremove) = structureofeachcolumn[ii][jj]
        alldataframes[ii][coltofix] = turnToNumeric(alldataframes[ii][coltofix],
                                                    decimaldelimiter,
                                                    thousanddelimeter,
                                                    listofstringstoremove)

In [8]:
#================================================================================================
# USER INPUT!
# Rows that are missing critical information need to be removed: for each dataframe, list the
# very important columns in which a NaN makes the whole row unusable.
criticalcolumns = [["Survived", "Pclass"]]
#================================================================================================

# FROM HERE ON IT'S AUTOMATIC

# Drop, from every dataframe, the rows that have a NaN in one of its critical columns
for ii, dataframe in enumerate(alldataframes):
    alldataframes[ii] = dataframe.dropna(subset=criticalcolumns[ii])

In [9]:
#================================================================================================
# USER INPUT!
# The remaining columns may still hold some NaNs, which should be replaced with an appropriate value.
# whattodowithnans holds one dictionary per dataframe that maps a column name to the value that
# replaces its NaNs, i.e. [{"colname": valueforNaN,...},...]

whattodowithnans = [
    {
        "Survived": -1,
        "Pclass": -1,
        "Name": "Unknown name",
        "Sex": "Unspecified",
        "Age": alldataframes[0]["Age"].dropna().mean(),
        "Ticket": "XXXXXX",
        "Cabin": "XXX",
        "Embarked": "X",
    }
]

#================================================================================================

# FROM HERE ON IT'S AUTOMATIC

# Fill the NaNs of every dataframe with the replacement values chosen for it
for ii, dataframe in enumerate(alldataframes):
    alldataframes[ii] = dataframe.fillna(whattodowithnans[ii])

In [16]:
#================================================================================================
# USER INPUT!
# Sometimes there are duplicate entries. Some columns provide unique identifiers for the rows
# (e.g. email address, full name, etc.), which is what identifies a duplicate. For a dataframe
# without identifier columns, plug the empty list [] into identifiercolumns.

identifiercolumns = [["PassengerId"]]

#================================================================================================

# FROM HERE ON IT'S AUTOMATIC

# Without identifier columns a row is a duplicate only when all of its columns match
for ii, dataframe in enumerate(alldataframes):
    if identifiercolumns[ii] == []:
        alldataframes[ii] = dataframe.drop_duplicates()
    else:
        alldataframes[ii] = dataframe.drop_duplicates(identifiercolumns[ii])

In [None]:
#================================================================================================
# USER INPUT!
# Some data in our database can be fake data (e.g. data generated while testing whether the database works).
# 
# 

In [None]:

# TODO: remove the fake data / test data. Determine some condition on each column: for a numeric column,
# being >, < or = to some value could mean it's bad data. Same for datetime. For a string column,
# the condition for bad data could be e.g. contains, ends with, starts with, is missing, length, ...


In [25]:
# Show the first rows of the first dataframe
alldataframes[0].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
