In [7]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

In [497]:
# Use this dictionary to map state names to two letter acronyms
states = {'OH': 'Ohio', 'KY': 'Kentucky', 'AS': 'American Samoa', 'NV': 'Nevada', 'WY': 'Wyoming', 'NA': 'National', 'AL': 'Alabama', 'MD': 'Maryland', 'AK': 'Alaska', 'UT': 'Utah', 'OR': 'Oregon', 'MT': 'Montana', 'IL': 'Illinois', 'TN': 'Tennessee', 'DC': 'District of Columbia', 'VT': 'Vermont', 'ID': 'Idaho', 'AR': 'Arkansas', 'ME': 'Maine', 'WA': 'Washington', 'HI': 'Hawaii', 'WI': 'Wisconsin', 'MI': 'Michigan', 'IN': 'Indiana', 'NJ': 'New Jersey', 'AZ': 'Arizona', 'GU': 'Guam', 'MS': 'Mississippi', 'PR': 'Puerto Rico', 'NC': 'North Carolina', 'TX': 'Texas', 'SD': 'South Dakota', 'MP': 'Northern Mariana Islands', 'IA': 'Iowa', 'MO': 'Missouri', 'CT': 'Connecticut', 'WV': 'West Virginia', 'SC': 'South Carolina', 'LA': 'Louisiana', 'KS': 'Kansas', 'NY': 'New York', 'NE': 'Nebraska', 'OK': 'Oklahoma', 'FL': 'Florida', 'CA': 'California', 'CO': 'Colorado', 'PA': 'Pennsylvania', 'DE': 'Delaware', 'NM': 'New Mexico', 'RI': 'Rhode Island', 'MN': 'Minnesota', 'VI': 'Virgin Islands', 'NH': 'New Hampshire', 'MA': 'Massachusetts', 'GA': 'Georgia', 'ND': 'North Dakota', 'VA': 'Virginia'}

For the function get_list_of_university towns, the data is in the text file university_towns.txt is in the following format:
StateOne[edit]
RegionOne (UniversityOne)[1]
RegionTwo (UniversityTwo)
RegionThree (UniversityThree)[2]
The question asks for this data to be cleaned up and returned in a DataFrame of this format:
State
RegionName
0
StateOne
RegionOne
1
StateOne
RegionTwo
2
StateOne
RegionThree
So for example if you have:
Alabama[edit]
Auburn (Auburn University)[1]
Florence (University of North Alabama)
Jacksonville (Jacksonville State University)[2]
This needs to be converted into the dataframe:
State
RegionName
0
Alabama
Auburn
1
Alabama
Florence
2
Alabama
Jacksonville
Some tips for cleaning the data:
1. For "State", removing characters from "[" to the end.
2. For "RegionName", when applicable, removing every character from " (" to the end.
3. Depending on how you read the data, you may need to remove newline character '\n'.
There may be a few RegionNames in the list that don't match up well to actual cities, but don't worry about doing anything special for these entries for the purposes of this assignment. Punctuation at the end of the RegionName not encapsulated in the parenthesis does not need to be removed.
An example:
The RegionName given to us by the text file:
The Five College Region of Western Massachusetts:
Should be unmodified:
State
RegionName
184
Massachusetts
The Five College Region of Western Massachusetts:


In [668]:
def get_list_of_university_towns():
    #'''Returns a DataFrame of towns and the states they are in from the 
    #university_towns.txt list. The format of the DataFrame should be:
    #DataFrame( [ ["Michigan", "Ann Arbor"], ["Michigan", "Yipsilanti"] ], 
    #columns=["State", "RegionName"]  )
    
    #The following cleaning needs to be done:

    #1. For "State", removing characters from "[" to the end.
    #2. For "RegionName", when applicable, removing every character from " (" to the end.
    #3. Depending on how you read the data, you may need to remove newline character '\n'. '''
    
    #data = pd.read_csv('university_towns.txt', sep=r'\[2\]', header=None)
    data = pd.read_csv('university_towns.txt', sep=r'^.+\["edit"\]' , header=None, engine='python')
    data.columns = ["RegionName"]
    
    #data['State'] = data['RegionName'].str.endswith(r'\["edit"\]')
    data['State'] = data['RegionName'].where(data['RegionName'].str.endswith('[edit]'))
    
    #data.State.fillna(method='ffill')
    data['State'] = data['State'].fillna(method='ffill')
         
    data['State'] = data['State'].str.replace('\[.*\]','')
    data['RegionName'] = data['RegionName'].str.replace('\[.*\]','')
    data['RegionName'] = data['RegionName'].str.replace('\(.*\)','')
    
    data.drop(data['RegionName'] == data['State'].index,inplace=True)
    data.reset_index(inplace=True,drop=True)
    
    data['State'] = data['State'].str.strip()
    data['RegionName'] = data['RegionName'].str.strip()
    cols = ['State','RegionName']
    data = data[cols]
    data.set_index('State',inplace=True)
    return data.to_records().tolist()

In [669]:
df = get_list_of_university_towns()
# problem 157
df

[('Alabama', 'Auburn'),
 ('Alabama', 'Florence'),
 ('Alabama', 'Jacksonville'),
 ('Alabama', 'Livingston'),
 ('Alabama', 'Montevallo'),
 ('Alabama', 'Troy'),
 ('Alabama', 'Tuscaloosa'),
 ('Alabama', 'Tuskegee'),
 ('Alaska', 'Alaska'),
 ('Alaska', 'Fairbanks'),
 ('Arizona', 'Arizona'),
 ('Arizona', 'Flagstaff'),
 ('Arizona', 'Tempe'),
 ('Arizona', 'Tucson'),
 ('Arkansas', 'Arkansas'),
 ('Arkansas', 'Arkadelphia'),
 ('Arkansas', 'Conway'),
 ('Arkansas', 'Fayetteville'),
 ('Arkansas', 'Jonesboro'),
 ('Arkansas', 'Magnolia'),
 ('Arkansas', 'Monticello'),
 ('Arkansas', 'Russellville'),
 ('Arkansas', 'Searcy'),
 ('California', 'California'),
 ('California', 'Angwin'),
 ('California', 'Arcata'),
 ('California', 'Berkeley'),
 ('California', 'Chico'),
 ('California', 'Claremont'),
 ('California', 'Cotati'),
 ('California', 'Davis'),
 ('California', 'Irvine'),
 ('California', 'Isla Vista'),
 ('California', 'University Park, Los Angeles'),
 ('California', 'Merced'),
 ('California', 'Orange'),
 ('

In [495]:
def read_city_zhvi_allhomes():
    city = pd.read_csv('City_Zhvi_AllHomes.csv')
    return city

In [342]:
def read_gdp_file():
    gdp_data = pd.ExcelFile('gdplev.xls')
    gdp_data.sheet_names
    gdp = gdp_data.parse("Sheet1")
    gdp = gdp.dropna(axis=1,how='all')
    gdp = gdp.dropna(axis=0,how='all')    
    gdp.columns = ['Period-A','GDP1-A','GDP2-A','Period-Q','GDP1-Q','GDP2-Q']
   
    gdp.drop(gdp.index[:3], inplace=True)
    gdp.reset_index(inplace=True,drop=True)
    
    return gdp 


In [350]:
df = read_city_zhvi_allhomes()
df.head()
df.set_index?

In [524]:
def convert_housing_data_to_quarters():
    '''Converts the housing data to quarters and returns it as mean 
    values in a dataframe. This dataframe should be a dataframe with
    columns for 2000q1 through 2016q3, and should have a multi-index
    in the shape of ["State","RegionName"].
    
    A quarter is a specific three month period, Q1 is January through March,
    Q2 is April through June, Q3 is July through September, Q4 is October through December.
    The resulting dataframe should have 67 columns, and 10,730 rows.
    '''
    
    data = pd.read_csv('City_Zhvi_AllHomes.csv')
    data.replace({"State": states},inplace=True)
    data.set_index(['State','RegionName'],drop=True, inplace=True)
    
    for i in range(1997,2000):
        for j in range(1,13):
            if ( j < 10):
                j = '0' + str(j)
            data.drop(str(i) +'-'+ str(j),axis=1,inplace=True)
            
    for j in range(4,13):
        if ( j < 10):
                j = '0' + str(j)
        data.drop('1996' +'-'+ str(j),axis=1,inplace=True)

    for i in range(2000,2016):
        data[str(i)+"q1"] = data[[str(i)+'-'+str('01'),str(i)+'-'+str('02'),str(i)+'-'+str('03')]].mean()
        data[str(i)+"q2"] = data[[str(i)+'-'+str('04'),str(i)+'-'+str('05'),str(i)+'-'+str('06')]].mean()
        data[str(i)+"q3"] = data[[str(i)+'-'+str('07'),str(i)+'-'+str('08'),str(i)+'-'+str('09')]].mean()
        data[str(i)+"q4"] = data[[str(i)+'-'+str('10'),str(i)+'-'+str('11'),str(i)+'-'+str('12')]].mean()
        
    data[str(2016)+"q1"] = data[[str(2016)+'-'+str('01'),str(2016)+'-'+str('02'),str(2016)+'-'+str('03')]].mean()   
    data[str(2016)+"q2"] = data[[str(2016)+'-'+str('04'),str(2016)+'-'+str('05'),str(2016)+'-'+str('06')]].mean()   
    data[str(2016)+"q3"] = data[[str(2016)+'-'+str('07'),str(2016)+'-'+str('08')]].mean()   
    
    for i in range(2000,2016):
        for j in range(1,13):
            if ( j < 10):
                j = '0' + str(j)
            data.drop(str(i) +'-'+ str(j),axis=1,inplace=True)
            
    for j in range(1,9):
        if ( j < 10):
            j = '0' + str(j)
        data.drop(str(2016) +'-'+ str(j),axis=1,inplace=True)   
    data.drop(['RegionID','Metro','CountyName','SizeRank'],axis=1,inplace=True)        
    return data

In [526]:
data = convert_housing_data_to_quarters()
#convert_housing_data_to_quarters().loc["Texas"].loc["Austin"].loc["2010q3"]
#s = data.loc["New York"].loc["New York"].loc["2001q1"]

In [648]:
def test():
    data = pd.read_csv('university_towns.txt', sep=r'^.+\["edit"\]' , header=None, engine='python')
    data.columns = ["RegionName"]
    
    #data['State'] = data['RegionName'].str.endswith(r'\["edit"\]')
    data['State'] = data['RegionName'].where(data['RegionName'].str.endswith('[edit]'))
    
    #data.State.fillna(method='ffill')
    data['State'] = data['State'].fillna(method='ffill')
    data['State'] = data['State'].str.replace('\[.*\]','')
    data['RegionName'] = data['RegionName'].str.replace('\[.*\]','')
    data['RegionName'] = data['RegionName'].str.replace('\(.*\)','')
    
    data.drop(data['RegionName'] == data['State'].index,inplace=True)
    data.reset_index(inplace=True,drop=True)
    
    data['State'] = data['State'].str.strip()
    data['RegionName'] = data['RegionName'].str.strip()
    cols = ['State','RegionName']
    data = data[cols]
    
    #data['State'].loc[data['RegionName'] == data['State']]
    return data