# Data Cleaning Example 2: University & Location

## Import

In [1]:
import pandas as pd
import numpy as np

## 1) Extract data from txt

In [6]:
# Collect (state, town) pairs from the flat text file. The file interleaves
# state header lines (marked with '[edit]') with the town lines that belong
# to the most recent header.
university_towns = []

# Decode explicitly as UTF-8 rather than relying on the platform default:
# names containing an en dash are otherwise garbled (shown as 'â€“') when the
# default codec is cp1252.
with open('Datasets/university_towns.txt', encoding='utf-8') as file:
    for line in file:
        if '[edit]' in line:
            # Remember this `state` until the next is found
            state = line
        else:
            # Otherwise, we have a city; keep `state` as last-seen
            university_towns.append((state, line))

university_towns[:5]

[('Alabama[edit]\n', 'Auburn (Auburn University)[1]\n'),
 ('Alabama[edit]\n', 'Florence (University of North Alabama)\n'),
 ('Alabama[edit]\n', 'Jacksonville (Jacksonville State University)[2]\n'),
 ('Alabama[edit]\n', 'Livingston (University of West Alabama)[2]\n'),
 ('Alabama[edit]\n', 'Montevallo (University of Montevallo)[2]\n')]

## 2) Convert to data frame

In [7]:
# One row per (state, town) tuple; name the two tuple positions as columns.
towns_df = pd.DataFrame(data=university_towns, columns=['State', 'RegionName'])

In [8]:
# Display the assembled frame to inspect the raw, not-yet-cleaned values.
towns_df

Unnamed: 0,State,RegionName
0,Alabama[edit]\n,Auburn (Auburn University)[1]\n
1,Alabama[edit]\n,Florence (University of North Alabama)\n
2,Alabama[edit]\n,Jacksonville (Jacksonville State University)[2]\n
3,Alabama[edit]\n,Livingston (University of West Alabama)[2]\n
4,Alabama[edit]\n,Montevallo (University of Montevallo)[2]\n
...,...,...
512,Wisconsin[edit]\n,River Falls (University of Wisconsin–River F...
513,Wisconsin[edit]\n,Stevens Point (University of Wisconsin–Steve...
514,Wisconsin[edit]\n,Waukesha (Carroll University)\n
515,Wisconsin[edit]\n,Whitewater (University of Wisconsin–Whitewat...


## 3) Cleaning the Entire Dataset Using the applymap Function

### Create a function 

In [9]:
def get_citystate(item):
    """Return the bare state or town name from one raw cell.

    Raw cells look like ``'Alabama[edit]\\n'`` or
    ``'Auburn (Auburn University)[1]\\n'``. Everything from the first
    ``' ('`` or ``'['`` onward is dropped, and surrounding whitespace
    (including the trailing newline) is stripped.

    Parameters
    ----------
    item : object
        A single DataFrame cell. Non-string values (e.g. NaN or None) are
        returned unchanged so the function is safe to apply element-wise.

    Returns
    -------
    object
        The cleaned name for string input, otherwise ``item`` itself.
    """
    if not isinstance(item, str):
        # Missing values are not strings; the `in`/`find` logic below would
        # raise on them, so pass them through untouched.
        return item

    # Cut at whichever marker appears first, so a footnote that precedes the
    # parenthesis ('Town[3] (University)') is removed as well.
    cut = len(item)
    for marker in (' (', '['):
        pos = item.find(marker)
        if pos != -1:
            cut = min(cut, pos)

    # strip() also handles entries with no marker at all, which would
    # otherwise keep their trailing newline.
    return item[:cut].strip()

In [10]:
# Run get_citystate on every cell of the frame. DataFrame.applymap was
# deprecated in pandas 2.1 in favour of the equivalent DataFrame.map, so use
# the new name when this pandas provides it and fall back otherwise.
# The check is made on the class so a column named 'map' cannot shadow it.
if hasattr(pd.DataFrame, 'map'):
    towns_df = towns_df.map(get_citystate)
else:
    towns_df = towns_df.applymap(get_citystate)

In [11]:
# Display the frame again to check the result of the element-wise cleaning.
towns_df

Unnamed: 0,State,RegionName
0,Alabama,Auburn
1,Alabama,Florence
2,Alabama,Jacksonville
3,Alabama,Livingston
4,Alabama,Montevallo
...,...,...
512,Wisconsin,River Falls
513,Wisconsin,Stevens Point
514,Wisconsin,Waukesha
515,Wisconsin,Whitewater


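The applymap() method passes each element of the DataFrame to the function and replaces the original value with the value the function returns. (In pandas 2.1 and later the same element-wise method is named DataFrame.map(); applymap() is deprecated.)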