In [1]:
import pandas as pd

## dataframe vectorized methods

In [2]:
# Load the sales table and use its "month" column as the row index
df = pd.read_csv("sales.csv", index_col="month")

In [3]:
# Display the loaded sales table
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55



### convert number of sold eggs to dozens

In [4]:
# Floor-divide every numeric column by 12 (whole dozens); df itself is not modified
df // 12

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


### numpy std functions

In [5]:
import numpy as np

In [8]:
# The same computation using NumPy's standard floor_divide function
np.floor_divide(df, 12) # convert to dozens of units

  


Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3.0,1.0,1.0
Feb,9.0,4.0,2.0
Mar,18.0,7.0,6.0
Apr,6.0,7.0,1.0
May,11.0,,4.0
Jun,17.0,5.0,4.0


### using custom functions

In [16]:
def dozens(n):
    """Return the number of complete dozens contained in ``n``.

    Only the ``//`` operator is used, so ``n`` may be a plain number or any
    object that supports floor division (for example a pandas Series).
    """
    full_dozens = n // 12
    return full_dozens

In [17]:
# Apply the custom function to df; the result is a new DataFrame, df is unchanged
df.apply(dozens)

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,3,1.0,1,0
Feb,9,4.0,2,0
Mar,18,7.0,6,1
Apr,6,7.0,1,0
May,11,,4,0
Jun,17,5.0,4,1


### using lambda functions

In [18]:
# Same conversion as with dozens(), written inline as an anonymous function
df.apply(lambda column: column // 12)

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,3,1.0,1,0
Feb,9,4.0,2,0
Mar,18,7.0,6,1
Apr,6,7.0,1,0
May,11,,4,0
Jun,17,5.0,4,1


## storing transformations

In [19]:
# None of the computations above modify the source df.
# To keep a result, assign it to a (new) column of df.
df["dozens_of_eggs"] = df["eggs"] // 12

In [20]:
# List the column labels to confirm the new column was stored
df.columns

Index(['eggs', 'salt', 'spam', 'dozens_of_eggs'], dtype='object')

## dataframe index (string operations)

In [21]:
# Display df, which now includes the dozens_of_eggs column
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,3
Feb,110,50.0,31,9
Mar,221,89.0,72,18
Apr,77,87.0,20,6
May,132,,52,11
Jun,205,60.0,55,17


In [22]:
# Inspect the row index (the month labels)
df.index

Index(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'], dtype='object', name='month')

In [24]:
# To use string methods on the index (or on any other string-valued object), go through the .str accessor
df.index = df.index.str.upper()

In [25]:
# Confirm that the index labels are now upper case
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JAN,47,12.0,17,3
FEB,110,50.0,31,9
MAR,221,89.0,72,18
APR,77,87.0,20,6
MAY,132,,52,11
JUN,205,60.0,55,17


## methods for index

The index has no apply() method; use its map() method instead.

In [26]:
# Map str.lower over every index label and store the lower-cased index back on df
df.index = df.index.map(str.lower)

### defining columns using other columns values

In [27]:
# Element-wise sum of two existing columns; rows with a missing salt value stay missing
df["salty_eggs"] = df["salt"] + df["dozens_of_eggs"]

In [28]:
# Display df with the new salty_eggs column
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs,salty_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
jan,47,12.0,17,3,15.0
feb,110,50.0,31,9,59.0
mar,221,89.0,72,18,107.0
apr,77,87.0,20,6,93.0
may,132,,52,11,
jun,205,60.0,55,17,77.0


#### Using apply() to transform a column
The .apply() method can be used on a pandas DataFrame to apply an arbitrary Python function to every element. In this exercise you'll take daily weather data in Pittsburgh in 2013 obtained from Weather Underground.

A function to convert degrees Fahrenheit to degrees Celsius has been written for you. Your job is to use the .apply() method to perform this conversion on the 'Mean TemperatureF' and 'Mean Dew PointF' columns of the weather DataFrame.

- Apply the to_celsius() function over the ['Mean TemperatureF','Mean Dew PointF'] columns of the weather DataFrame.
- Reassign the column labels of df_celsius to ['Mean TemperatureC','Mean Dew PointC'] using the .columns attribute.
- Hit 'Submit Answer' to see the new DataFrame with the converted units.

In [43]:
# Load the Pittsburgh 2013 weather data and use its "Date" column as the row index
weather = pd.read_csv("pittsburgh2013.csv", index_col="Date")

In [44]:
# List the available weather columns
weather.columns

Index(['Max TemperatureF', 'Mean TemperatureF', 'Min TemperatureF',
       'Max Dew PointF', 'Mean Dew PointF', 'Min DewpointF', 'Max Humidity',
       'Mean Humidity', 'Min Humidity', 'Max Sea Level PressureIn',
       'Mean Sea Level PressureIn', 'Min Sea Level PressureIn',
       'Max VisibilityMiles', 'Mean VisibilityMiles', 'Min VisibilityMiles',
       'Max Wind SpeedMPH', 'Mean Wind SpeedMPH', 'Max Gust SpeedMPH',
       'PrecipitationIn', ' CloudCover', 'Events', 'WindDirDegrees'],
      dtype='object')

In [45]:
# Function that converts degrees Fahrenheit to degrees Celsius: to_celsius
def to_celsius(F):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Only arithmetic operators are used, so ``F`` may be a single number or
    any object supporting them element-wise (for example a pandas Series).
    """
    degrees_above_freezing = F - 32
    return 5/9 * degrees_above_freezing

In [48]:
# Convert the mean temperature and mean dew point columns to Celsius: df_celsius
fahrenheit_columns = ['Mean TemperatureF', 'Mean Dew PointF']
df_celsius = weather[fahrenheit_columns].apply(to_celsius)

In [49]:
# Relabel the columns of df_celsius so the names reflect the new unit (C instead of F)
df_celsius.columns = ['Mean TemperatureC', 'Mean Dew PointC']

In [52]:
# Show the first rows of the converted DataFrame
df_celsius.head()

Unnamed: 0_level_0,Mean TemperatureC,Mean Dew PointC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-1-1,-2.222222,-2.777778
2013-1-2,-6.111111,-11.111111
2013-1-3,-4.444444,-9.444444
2013-1-4,-2.222222,-7.222222
2013-1-5,-1.111111,-6.666667


#### Using .map() with a dictionary
The .map() method is used to transform values according to a Python dictionary look-up. In this exercise you'll practice this method while returning to working with the election DataFrame, which has been pre-loaded for you.

Your job is to use a dictionary to map the values 'Obama' and 'Romney' in the 'winner' column to the values 'blue' and 'red', and assign the output to the new column 'color'

- Create a dictionary with the key:value pairs 'Obama':'blue' and 'Romney':'red'.
- Use the .map() method on the 'winner' column using the red_vs_blue dictionary you created.
- Print the output of election.head(). This has been done for you, so hit 'Submit Answer' to see the new column!

In [53]:
filename = "pennsylvania2012_turnout.csv"
# Read the file and use its 'county' column as the row index: election
election = pd.read_csv(filename, index_col='county')


In [55]:
# Look-up table from winning candidate to display color: red_vs_blue
red_vs_blue = dict(Obama="blue", Romney="red")

In [56]:
# Look up each value of the 'winner' column in red_vs_blue and store the result
# in the new column: election['color']
election['color'] = election["winner"].map(red_vs_blue)

In [57]:
# Preview the first rows, including the 'color' column
election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin,color
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Adams,PA,41973,35.482334,63.112001,Romney,61156,68.632677,27.629667,red
Allegheny,PA,614671,56.640219,42.18582,Obama,924351,66.497575,14.454399,blue
Armstrong,PA,28322,30.696985,67.901278,Romney,42147,67.19814,37.204293,red
Beaver,PA,80015,46.032619,52.63763,Romney,115157,69.483401,6.605012,red
Bedford,PA,21444,22.057452,76.98657,Romney,32189,66.619031,54.929118,red


### Using vectorized functions
When performance is paramount, you should avoid using .apply() and .map() because those constructs perform Python for-loops over the data stored in a pandas Series or DataFrame. By using vectorized functions instead, you can loop over the data at the same speed as compiled code (C, Fortran, etc.)! NumPy, SciPy and pandas come with a variety of vectorized functions (called Universal Functions or UFuncs in NumPy).

You can even write your own vectorized functions, but for now we will focus on the ones distributed by NumPy and pandas.

In this exercise you're going to import the zscore function from scipy.stats and use it to compute the deviation in voter turnout in Pennsylvania from the mean in fractions of the standard deviation. In statistics, the z-score is the number of standard deviations by which an observation is above the mean - so if it is negative, it means the observation is below the mean.

Instead of using .apply() as you did in the earlier exercises, the zscore UFunc will take a pandas Series as input and return a NumPy array. You will then assign the values of the NumPy array to a new column in the DataFrame. You will be working with the election DataFrame - it has been pre-loaded for you.

- Import zscore from scipy.stats.
- Call zscore with election['turnout'] as input.
- Print the output of type(turnout_zscore). This has been done for you.
- Assign turnout_zscore to a new column in election as 'turnout_zscore'.
- Print the output of election.head(). This has been done for you, so hit 'Submit Answer' to view the result.

In [59]:
# Import zscore from scipy.stats
from scipy.stats import zscore 

In [62]:
# Compute the z-score of every county's turnout in one vectorized call: turnout_zscore
turnout_zscore = zscore(election["turnout"])

In [63]:
# Print the type of turnout_zscore to see what kind of object zscore returned
print(type(turnout_zscore))

<class 'numpy.ndarray'>


In [65]:
# Store the computed z-scores as a new column: election['turnout_zscore']
election["turnout_zscore"] = turnout_zscore

In [66]:
# Preview the first rows, including the 'turnout_zscore' column
election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin,color,turnout_zscore
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Adams,PA,41973,35.482334,63.112001,Romney,61156,68.632677,27.629667,red,0.853734
Allegheny,PA,614671,56.640219,42.18582,Obama,924351,66.497575,14.454399,blue,0.439846
Armstrong,PA,28322,30.696985,67.901278,Romney,42147,67.19814,37.204293,red,0.57565
Beaver,PA,80015,46.032619,52.63763,Romney,115157,69.483401,6.605012,red,1.018647
Bedford,PA,21444,22.057452,76.98657,Romney,32189,66.619031,54.929118,red,0.463391
