In [44]:
# Import all the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [45]:
#================================================================================================
# USER INPUT!
# Here I specify which data files need reading in
datafilenames = ["train.csv"] # In this case I only have one file to read in
#================================================================================================

In [46]:
alldataframes = [pd.read_csv(filename) for filename in datafilenames]

In [119]:
# Here are a bunch of functions that are aimed at detecting dirty data and cleaning it

# This function takes a column and does a tally of the different types of entries (int, float, string, etc.)
# It then spits out a tuple of the different types and their relative frequencies in the column
def ratiosOfDifferentTypes(column):
    uniquerows = column.drop_duplicates() #.dropna().drop_duplicates() THIS SHOULD BE FIXED!!! WE NEED TO ALSO DO .dropna() !!!
    uniquerows = uniquerows.sample(n=min(1000,uniquerows.size))
    types = [type(entry) for entry in uniquerows]
    differenttypes = list(set(types))
    tally = [types.count(giventype) for giventype in differenttypes]
    total = sum(tally)*1.
    frequencies = [tallyelement / total for tallyelement in tally]
    return (differenttypes,frequencies)

# This function takes a column and decides which type its entries are meant to be like.
# It returns the type. If the entries are so mixed that it can't decide, it returns object.
def decideType(column):
    typesandratios = ratiosOfDifferentTypes(column)
    if max(typesandratios[1]) >= 0.8:
        # all the rows should probably be of the same type and some have been inputted incorrectly
        correcttype = typesandratios[0][typesandratios[1].index(max(typesandratios[1]))]
    else:
        # the rows have a very mixed type and it's not very clear what the correct type is
        correcttype = object
    return correcttype

# This function goes through all columns in the dataframe and returns the name of the columns that are dirty,
# i.e. that have mixed types of entries.
def findMixedTypes(dataframe):
    return [col for col in dataframe if len(ratiosOfDifferentTypes(dataframe[col])[1])>1]

# This function takes a dataframe and for each column says whether it's clean or dirty. If it's dirty,
# it tries to decide which type it should be.
def analyzeColumnTypes(dataframe):
    mixedtypecolumns = findMixedTypes(dataframe)
    if mixedtypecolumns==[]:
        print "All columns have a single type; they are 'clean'. (They may be incorrect though, or have NaNs)."
    else:
        print "The columns",mixedtypecolumns,"have mixed types:\n"
        correcttypes = [(colname,decideType(dataframe[colname])) for colname in mixedtypecolumns]
        for typ in correcttypes:
            if typ[1]==object:
                print " - \'" + typ[0] + "\'" + " is so mixed it's hard to tell the right type"
            else:
                print " - \'" + typ[0] + "\'" + " should be " + "\'" + typ[1].__name__ + "\'"
        print "\nAll other columns have a single type; they are 'clean'. (They may be incorrect though, or have NaNs)."
    return mixedtypecolumns

def findCleanStringTypes(dataframe):
    return [col for col in dataframe if ratiosOfDifferentTypes(dataframe[col])[0]==[str]]

def outlineNaNs(dataframe):
    totalnumberofNaNs = pd.isnull(dataframe).sum()
    percentageofNaNs = totalnumberofNaNs[totalnumberofNaNs > 0].astype(np.float64) / dataframe.shape[0]
    if len(percentageofNaNs)>0:
        print "Here are the columns with NaNs:"
        for kk in range(len(percentageofNaNs)):
            print " - \'" + percentageofNaNs.index[kk] + "\' has percentage of NaNs: \t" + str(percentageofNaNs[kk]) + " %"
        print "\nNo other columns have NaNs."
    else:
        print "No columns have NaNs."

def stripStartEndSpaces(listofdataframes):
    outputdataframes = listofdataframes
    for ii in range(len(listofdataframes)):
        cleanstringcolumns = findCleanStringTypes(listofdataframes[ii])
        for colname in cleanstringcolumns:
            outputdataframes[0].loc[:,colname] = listofdataframes[0][colname].str.strip()
    return outputdataframes

In [48]:
# Now we're going to print out which columns have dirty entries, i.e. have mixed types,
# and we'll try and guess what those entries should be

mixedtypesindataframes = range(len(alldataframes))
for ii in range(len(alldataframes)):
    print "ANALYZING DATAFRAME FROM " + datafilenames[ii] + ":"
    print "===================================================="
    mixedtypesindataframes[ii] =  analyzeColumnTypes(alldataframes[ii])
    print "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -"
    outlineNaNs(alldataframes[ii])
    print "===================================================="

ANALYZING DATAFRAME FROM train.csv:
The columns ['Cabin', 'Embarked'] have mixed types:

 - 'Cabin' should be 'str'
 - 'Embarked' is so mixed it's hard to tell the right type

All other columns have a single type; they are 'clean'. (They may be incorrect though, or have NaNs).
-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
Here are the columns with NaNs:
 - 'Age' has percentage of NaNs: 	0.198653198653 %
 - 'Cabin' has percentage of NaNs: 	0.771043771044 %
 - 'Embarked' has percentage of NaNs: 	0.00224466891134 %

No other columns have NaNs.


In [49]:
# Let's look at the UNIQUE values. 
# We'll go through the dirty entries, as well as the purely-string entries.

for ii in range(len(alldataframes)):
    print "ANALYZING DATAFRAME FROM " + datafilenames[ii] + ":"
    print "----------------------------------------------------"
    cleanstringcolumns = findCleanStringTypes(alldataframes[ii])
    for col in mixedtypesindataframes[ii] + cleanstringcolumns:
        print "Column '" + col + "' has the following unique entries:\n"
        print np.sort(alldataframes[ii][col].unique())
        print

ANALYZING DATAFRAME FROM train.csv:
----------------------------------------------------
Column 'Cabin' has the following unique entries:

[nan 'A10' 'A14' 'A16' 'A19' 'A20' 'A23' 'A24' 'A26' 'A31' 'A32' 'A34'
 'A36' 'A5' 'A6' 'A7' 'B101' 'B102' 'B18' 'B19' 'B20' 'B22' 'B28' 'B3'
 'B30' 'B35' 'B37' 'B38' 'B39' 'B4' 'B41' 'B42' 'B49' 'B5' 'B50'
 'B51 B53 B55' 'B57 B59 B63 B66' 'B58 B60' 'B69' 'B71' 'B73' 'B77' 'B78'
 'B79' 'B80' 'B82 B84' 'B86' 'B94' 'B96 B98' 'C101' 'C103' 'C104' 'C106'
 'C110' 'C111' 'C118' 'C123' 'C124' 'C125' 'C126' 'C128' 'C148' 'C2'
 'C22 C26' 'C23 C25 C27' 'C30' 'C32' 'C45' 'C46' 'C47' 'C49' 'C50' 'C52'
 'C54' 'C62 C64' 'C65' 'C68' 'C7' 'C70' 'C78' 'C82' 'C83' 'C85' 'C86' 'C87'
 'C90' 'C91' 'C92' 'C93' 'C95' 'C99' 'D' 'D10 D12' 'D11' 'D15' 'D17' 'D19'
 'D20' 'D21' 'D26' 'D28' 'D30' 'D33' 'D35' 'D36' 'D37' 'D45' 'D46' 'D47'
 'D48' 'D49' 'D50' 'D56' 'D6' 'D7' 'D9' 'E10' 'E101' 'E12' 'E121' 'E17'
 'E24' 'E25' 'E31' 'E33' 'E34' 'E36' 'E38' 'E40' 'E44' 'E46' 'E49' 'E5

In [50]:
#================================================================================================
# USER INPUT!
# After having run the previous cells you know which columns are clean and dirty.
# Look at the unique values of the dirty ones, given above. Use this information to clean them up

# In each dataframe, there are certain dirty columns that should be numeric
columnsthatshouldbenumeric = [
    ["Survived","Pclass"]
]
# In each column there will be some conventions on how the bad things are written out.
# For each column that should be numeric, we specify a tuple with the info
# (decimaldelimiter (a string),thousanddelimeter (a string), listofstringstoremove (a list))
structureofeachcolumn = [
    [
        (".",",",["-"," ","%"]),
        (".",",",["-"," "])
    ]
]

columnsthatshouldbestrings = [
    ["Name","Sex"]
]

columnsthatshouldbedatetimes = [
    []
]
#================================================================================================

# FROM HERE ON IT'S AUTOMATIC

# This is a test column to see if it all works. CAN BE REMOVED.
# messedupcolumn = pd.Series([1,"200",3,4,5.99,11,"11 , 000 .203","11-2%22"])

# This function takes a column that should be numeric but is all dirty with badly made strings. It removes
# the thousand-delimiters, it replaces the decimaldelimiters with periods, and removes any user-chosen 
# additional set of characters
def turnToNumeric(column,decimaldelimiter=".",thousanddelimeter=",",listofstringstoremove=["-"," ","%"]):
    toremoveregex = str(listofstringstoremove + [thousanddelimeter]).rstrip("]'").lstrip("'[").replace("', '","|")
    numericcolumn = pd.to_numeric(column.astype(str).str.replace(toremoveregex,"").str.replace(decimaldelimiter,"."))
    return numericcolumn

# This function takes a dirty column that should all be strings and turns it into such
def turnToString(column):
    return column.astype(str)

def turnToDate(column):
    return pd.to_datetime(column,dayfirst=True)

for ii in range(len(alldataframes)):
    for (jj,coltofix) in enumerate(columnsthatshouldbestrings[ii]):
        alldataframes[ii].loc[:,coltofix] = turnToString(alldataframes[ii][coltofix])
    
    for (jj,coltofix) in enumerate(columnsthatshouldbedatetimes[ii]):
        alldataframes[ii].loc[:,coltofix] = turnToDate(alldataframes[ii][coltofix])
        
    for (jj,coltofix) in enumerate(columnsthatshouldbenumeric[ii]):
        alldataframes[ii].loc[:,coltofix] = turnToNumeric(alldataframes[ii][coltofix],
                                                    structureofeachcolumn[ii][jj][0],
                                                    structureofeachcolumn[ii][jj][1],
                                                    structureofeachcolumn[ii][jj][2])

In [51]:
#================================================================================================
# USER INPUT!
# Now we need to remove those rows of data that are missing critical information, i.e. remove 
# those rows that have a NaN for something very important.
criticalcolumns = [
    ["Survived","Pclass"]
]
#================================================================================================

# FROM HERE ON IT'S AUTOMATIC

for ii in range(len(alldataframes)):
    alldataframes[ii] = alldataframes[ii].dropna(subset = criticalcolumns[ii])

In [120]:
#================================================================================================
# USER INPUT!
# In the remaining columns there may be some NaNs, which should be replaced with some appropriate value.
# The variable whattodowithnans has the structure of a dictionary for every dataframe, i.e. [{"colname": valueforNaN,...},...]

whattodowithnans = [
    {"Survived": -1, "Pclass": -1,"Name": "Unknown name", "Sex": "Unspecified", 
     "Age": alldataframes[0]["Age"].dropna().mean(), "Ticket": "XXXXXX", 
     "Cabin": "XXX", "Embarked": "X"}
]

#================================================================================================

# FROM HERE ON IT'S AUTOMATIC

for ii in range(len(alldataframes)):
    alldataframes[ii] = alldataframes[ii].fillna(whattodowithnans[ii])

alldataframes = stripStartEndSpaces(alldataframes)

In [53]:
#================================================================================================
# USER INPUT!
# Some times there are duplicate entries. Some columns provide unique identifiers for the rows,
# for identifying unique entries (e.g. email address, full name, etc.). If there is no identifier
# column, we just plug the empty list [] into identifiercolumns.

identifiercolumns = [
    ["PassengerId"]
]

#================================================================================================

# FROM HERE ON IT'S AUTOMATIC

for ii in range(len(alldataframes)):
    if identifiercolumns[ii] != []:
        alldataframes[ii] = alldataframes[ii].drop_duplicates(identifiercolumns[ii])
    else:
        alldataframes[ii] = alldataframes[ii].drop_duplicates()

In [96]:
#================================================================================================
# USER INPUT!
# Some data in our database can be fake data (e.g. data generating from testing whether the database works).
# This should be identified using .unique() on each column, above, and thrown away.
#  - It could be numeric data that is too big, too small, or at some impossible value.
#  - It could be impossible datetime stamps
#  - It could be strings that don't make sense; they might contain, end, or start with something bad.
#    They might be missing some parts in the string, or be too long or too short.

# NUMERIC DATA
alldataframes[0] = alldataframes[0][(alldataframes[0]["Survived"]==-1) | (alldataframes[0]["Survived"]==0) | 
                                    (alldataframes[0]["Survived"]==1)]
alldataframes[0] = alldataframes[0][(alldataframes[0]["Pclass"]==-1) | (alldataframes[0]["Pclass"]==1) | 
                                    (alldataframes[0]["Pclass"]==2) | (alldataframes[0]["Pclass"]==3)]
alldataframes[0] = alldataframes[0][(-0.1 < alldataframes[0]["Age"]) & (alldataframes[0]["Age"] < 120)]
alldataframes[0] = alldataframes[0][(0.0 < alldataframes[0]["Fare"])]

# TIMESTAMP DATA

# STRING DATA
alldataframes[0] = alldataframes[0][(alldataframes[0]["Name"].str.contains("Pinko") & alldataframes[0]["Name"].str.contains("Pallino"))==False]
alldataframes[0] = alldataframes[0][(alldataframes[0]["Ticket"]!="False") & (alldataframes[0]["Ticket"]!="false")]
alldataframes[0] = alldataframes[0][(alldataframes[0]["Cabin"]!="Deck") & (alldataframes[0]["Cabin"]!="deck")]

#================================================================================================

In [159]:
#================================================================================================
# USER INPUT!
# Now the data is essentially clean. It is liekly to still have outliers, and things that don't make sense.
# We'll polish it up even further.

alldataframes[0] = alldataframes[0].drop("PassengerId", 1)


#================================================================================================

In [None]:
#================================================================================================
# USER INPUT!
# Now it's tie to decorate the data with more information that can be gleaned form the data.
# E.g. are there any twins? (were they more likely to survive?). If your siblings or parents survive, 
# are you more likely to? Etc. 

#================================================================================================

In [156]:
#================================================================================================
# This is just data-specific work to see how to polish my data further

#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
idnumbers = list(alldataframes[0]["PassengerId"].unique())
listofpassengers = range(1,alldataframes[0].shape[0]+1)

def complement(list1,list2):
    # Gives all elements in list1 that are not in list2
    return [el for el in list1 if el not in list2]

print complement(idnumbers,listofpassengers)
print complement(listofpassengers,idnumbers)
print len(idnumbers) == alldataframes[0].shape[0]
# From this I gather that the PassengerId is a unique identifier, but its enumeration skips numbers for no reason.
# We can safely remove this column altogether.

[877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891]
[180, 264, 272, 278, 303, 414, 467, 482, 598, 634, 675, 733, 807, 816, 823]
True


In [174]:
print np.sort(alldataframes[0]["Age"].unique())

[  0.42         0.67         0.75         0.83         0.92         1.           2.
   3.           4.           5.           6.           7.           8.           9.
  10.          11.          12.          13.          14.          14.5
  15.          16.          17.          18.          19.          20.
  20.5         21.          22.          23.          23.5         24.
  24.5         25.          26.          27.          28.          28.5
  29.          29.69911765  30.          30.5         31.          32.
  32.5         33.          34.          34.5         35.          36.
  36.5         37.          38.          39.          40.          40.5
  41.          42.          43.          44.          45.          45.5
  46.          47.          48.          49.          50.          51.          52.
  53.          54.          55.          55.5         56.          57.          58.
  59.          60.          61.          62.          63.          64.          65.
  66.   

In [176]:
alldataframes[0][alldataframes[0]["Age"]<1]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
78,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,XXX,S
305,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
469,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,XXX,C
644,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,XXX,C
755,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,XXX,S
803,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,XXX,C
831,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,XXX,S


In [180]:
alldataframes[0][alldataframes[0]["Name"].str.contains("Hamalainen") ]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
247,1,2,"Hamalainen, Mrs. William (Anna)",female,24.0,0,2,250649,14.5,XXX,S
755,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,XXX,S


In [184]:
for (ii,jj) in alldataframes[0].groupby("Age"):
    print jj["Name"]
    print ""

803    Thomas, Master. Assad Alexander
Name: Name, dtype: object

755    Hamalainen, Master. Viljo
Name: Name, dtype: object

469    Baclini, Miss. Helene Barbara
644           Baclini, Miss. Eugenie
Name: Name, dtype: object

78       Caldwell, Master. Alden Gates
831    Richards, Master. George Sibley
Name: Name, dtype: object

305    Allison, Master. Hudson Trevor
Name: Name, dtype: object

164       Panula, Master. Eino Viljami
172       Johnson, Miss. Eleanor Ileen
183          Becker, Master. Richard F
381        Nakid, Miss. Maria ("Mary")
386    Goodwin, Master. Sidney Leonard
788         Dean, Master. Bertram Vere
827              Mallet, Master. Andre
Name: Name, dtype: object

7         Palsson, Master. Gosta Leonard
16                  Rice, Master. Eugene
119    Andersson, Miss. Ellis Anna Maria
205           Strom, Miss. Telma Matilda
297         Allison, Miss. Helen Loraine
340       Navratil, Master. Edmond Roger
479             Hirvonen, Miss. Hildur E
530             