In [None]:
# data needed for the code
# dob_job_application_filings_subset.csv,

In [None]:
# Import pandas
import pandas as pd

# Load the DOB job application filings into a DataFrame: df
df = pd.read_csv('dob_job_application_filings_subset.csv')

# Show, one after another: the first rows, the last rows, the
# (rows, columns) shape and the column labels of df
print(df.head(), df.tail(), df.shape, df.columns, sep='\n')

# Show the first and last rows of df_subset
# NOTE(review): df_subset is not created anywhere in the visible file;
# presumably it was pre-loaded by the exercise environment - confirm.
print(df_subset.head(), df_subset.tail(), sep='\n')


In [None]:
# Show the summary (dtypes, non-null counts, memory usage) of df.
# DataFrame.info() writes the report itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None".
df.info()

# Show the summary of df_subset
# NOTE(review): df_subset is not created anywhere in the visible file;
# presumably it was pre-loaded by the exercise environment - confirm.
df_subset.info()


In [None]:
# As you've seen, .describe() can only be used on numeric columns.
# So how can you diagnose data issues when you have categorical data?
# One way is by using the .value_counts() method, which returns the
# frequency counts for each unique value in a column!

# This method also has an optional parameter called dropna, which is
# True by default. This means that if a column has missing data,
# value_counts() will not include a frequency count for it. Set the
# dropna parameter to False so that missing values in a column are
# counted as well.

# Frequency counts (missing values included) for each categorical
# column of interest: 'Borough', 'State' and 'Site Fill'
borough_counts = df['Borough'].value_counts(dropna=False)
print(borough_counts)

state_counts = df['State'].value_counts(dropna=False)
print(state_counts)

site_fill_counts = df['Site Fill'].value_counts(dropna=False)
print(site_fill_counts)



In [None]:
# The .plot() method allows you to create a plot of each column of a
# DataFrame. The kind parameter allows you to specify the type of plot
# to use - kind='hist', for example, plots a histogram.

# You'll notice that there are extremely large differences between the
# min and max values, and the plot will need to be adjusted accordingly.
# In such cases, it's good to look at the plot on a log scale. The
# keyword arguments logx=True or logy=True can be passed in to .plot()
# depending on which axis you want to rescale.

# Create a histogram of the 'Existing Zoning Sqft' column. Rotate the axis
# labels by 70 degrees and use a log scale for both axes.

# Import matplotlib.pyplot
import matplotlib.pyplot as plt

# Histogram of 'Existing Zoning Sqft': tick labels rotated by 70 degrees,
# both axes on a log scale
zoning_sqft = df['Existing Zoning Sqft']
zoning_sqft.plot.hist(rot=70, logx=True, logy=True)

# Render the figure
plt.show()

In [None]:
# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt

# One box of 'initial_cost' per 'Borough', tick labels rotated 90 degrees
df.boxplot(
    column='initial_cost',
    by='Borough',
    rot=90,
)

# Render the figure
plt.show()

In [None]:
# Boxplots are great when you have a numeric column that you want to
# compare across different categories. When you want to visualize two
# numeric columns, scatter plots are ideal.

# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt

# First scatter plot: initial cost against total estimated fee, full data
df.plot.scatter(x='initial_cost', y='total_est_fee', rot=70)
plt.show()

# Second scatter plot: the same two columns, taken from df_subset
df_subset.plot.scatter(x='initial_cost', y='total_est_fee', rot=70)
plt.show()

In [None]:
# For data to be tidy, it must have:
#   Each variable as a separate column.
#   Each row as a separate observation.
# As a data scientist, you'll encounter data that is represented in a
# variety of different ways, so it is important to be able to recognize tidy (or untidy) data when you see it.

# pd.melt() has two parameters you should be aware of: id_vars and value_vars.
# The id_vars represent the columns of the data you do not want to melt (i.e.,
# keep it in its current shape), while the value_vars represent the columns you do
# wish to melt into rows. By default, if no value_vars are provided, all columns
# not set in the id_vars will be melted.

# Show the first rows of the wide-format airquality data
print(airquality.head())

# Melt every column except 'Month' and 'Day' into rows: airquality_melt
airquality_melt = airquality.melt(id_vars=['Month', 'Day'])

# Show the first rows of the melted result
print(airquality_melt.head())

In [None]:
# When melting DataFrames, it would be better to have column names more 
# meaningful than variable and value.

# The default names may work in certain situations, but it's best to 
# always have data that is self explanatory.

# You can rename the variable column by specifying an argument to the
# var_name parameter, and the value column by specifying an argument to
# the value_name parameter. You will now practice doing exactly this. The
# DataFrame airquality has been pre-loaded for you.

# Show the first rows of the wide-format airquality data
print(airquality.head())

# Melt as before, but label the two generated columns explicitly:
# 'measurement' holds the former column name, 'reading' its value
airquality_melt = airquality.melt(
    id_vars=['Month', 'Day'],
    var_name='measurement',
    value_name='reading',
)

# Show the first rows of the melted result
print(airquality_melt.head())

In [None]:
# While melting takes a set of columns and turns it into a single column,
# pivoting will create a new column for each unique value in a specified column.

# .pivot_table() has an index parameter which you can use to specify the columns
# that you don't want pivoted: It is similar to the id_vars parameter of pd.melt().
# Two other parameters that you have to specify are columns (the name of the column
# you want to pivot), and values (the values to be used when the column is pivoted).
# Show the first rows of the long-format data
print(airquality_melt.head())

# Pivot back to wide format: one column per distinct 'measurement',
# filled from 'reading', indexed by ('Month', 'Day')
airquality_pivot = pd.pivot_table(
    airquality_melt,
    index=['Month', 'Day'],
    columns='measurement',
    values='reading',
)

# Show the first rows of the pivoted result
print(airquality_pivot.head())

In [None]:
# There's a very simple method you can use to get back the original 
# DataFrame from the pivoted DataFrame: .reset_index(). 

# Show the index the pivot left on airquality_pivot
print(airquality_pivot.index)

# Turn the index levels back into ordinary columns (drop=False is the
# default and is spelled out here only for clarity)
airquality_pivot = airquality_pivot.reset_index(drop=False)

# Show the index after the reset, followed by the first rows
print(airquality_pivot.index)
print(airquality_pivot.head())


In [None]:
# Pivot airquality_dup: airquality_pivot
# Duplicate (Month, Day, measurement) rows are collapsed by averaging their
# readings. The aggregation is named with the string 'mean': it needs no
# numpy import (np is not imported in this file) and avoids the deprecation
# warning newer pandas emits when np.mean is passed as aggfunc.
airquality_pivot = airquality_dup.pivot_table(
    index=['Month', 'Day'],
    columns='measurement',
    values='reading',
    aggfunc='mean',
)

# Reset the index of airquality_pivot
airquality_pivot = airquality_pivot.reset_index()

# Print the head of airquality_pivot
print(airquality_pivot.head())

# Print the head of airquality
print(airquality.head())


In [None]:
# Melt tb, keeping 'country' and 'year' as identifiers: tb_melt
tb_melt = tb.melt(id_vars=['country', 'year'])

# 'gender' is the first character of the melted column name
tb_melt['gender'] = tb_melt['variable'].str.get(0)

# 'age_group' is everything after that first character
tb_melt['age_group'] = tb_melt['variable'].str.slice(start=1)

# Show the first rows of tb_melt
print(tb_melt.head())

In [None]:
# Melt ebola, keeping 'Date' and 'Day' as identifiers: ebola_melt
ebola_melt = ebola.melt(
    id_vars=['Date', 'Day'],
    var_name='type_country',
    value_name='counts',
)

# Split each 'type_country' label on the underscore: 'str_split'
ebola_melt['str_split'] = ebola_melt['type_country'].str.split('_')

# First piece of the split goes to 'type', second piece to 'country'
ebola_melt['type'] = ebola_melt['str_split'].str[0]
ebola_melt['country'] = ebola_melt['str_split'].str[1]

# Show the first rows of ebola_melt
print(ebola_melt.head())


In [None]:
# Stack uber1, uber2 and uber3 on top of each other: row_concat
# (axis='index' is the row-wise default, written out explicitly)
row_concat = pd.concat([uber1, uber2, uber3], axis='index')

# Show the shape, then the first rows, of row_concat
print(row_concat.shape)
print(row_concat.head())


In [None]:
# Think of column-wise concatenation of data as stitching data
# together from the sides instead of the top and bottom. To perform
# this action, you use the same pd.concat() function, but this time
# with the keyword argument axis=1. The default, axis=0, is for
# a row-wise concatenation.

# Stitch ebola_melt and status_country together side by side: ebola_tidy
ebola_tidy = pd.concat([ebola_melt, status_country], axis='columns')

# Show the shape, then the first rows, of ebola_tidy
print(ebola_tidy.shape)
print(ebola_tidy.head())

In [None]:
# The glob module has a function called glob that takes a pattern and
# returns a list of the files in the working directory that match that pattern.

# Import necessary modules
import glob
import pandas as pd

# Write the pattern: pattern
pattern = '*.csv'

# Save all file matches: csv_files
# glob.glob() returns matches in arbitrary order, so the list is sorted to
# make positional access (and any later concatenation order) reproducible.
csv_files = sorted(glob.glob(pattern))

# Print the file names
print(csv_files)

# Load the second file (in sorted name order) into a DataFrame: csv2
# NOTE: this raises IndexError if fewer than two CSV files match.
csv2 = pd.read_csv(csv_files[1])

# Print the head of csv2
print(csv2.head())


In [None]:
# Read every file in csv_files into its own DataFrame: frames
# A comprehension keeps the loop variable local, so this step neither
# rebinds the module-level `df` loaded earlier nor leaves a global named
# `csv` (the name of a standard-library module) behind.
frames = [pd.read_csv(csv_path) for csv_path in csv_files]

# Concatenate frames into a single DataFrame: uber
uber = pd.concat(frames)

# Print the shape of uber
print(uber.shape)

# Print the head of uber
print(uber.head())


In [None]:
# Merging data allows you to combine disparate datasets into a 
# single dataset to do more complex analysis.

# Join site to visited, matching site['name'] against visited['site']: o2o
o2o = site.merge(visited, left_on='name', right_on='site')

# Show the merged result
print(o2o)


In [None]:
# In a many-to-one (or one-to-many) merge, one of the values will be
# duplicated and recycled in the output. That is, one of the keys in
# the merge is not unique.
# Join site to visited, matching site['name'] against visited['site']: m2o
m2o = site.merge(visited, left_on='name', right_on='site')

# Show the merged result
print(m2o)

In [None]:
# The final merging scenario occurs when both DataFrames do not have
# unique keys for a merge. What happens here is that for each duplicated
# key, every pairwise combination will be created.

# Step 1: join site to visited on site['name'] == visited['site']: m2m
m2m = site.merge(visited, left_on='name', right_on='site')

# Step 2: join that result to survey on m2m['ident'] == survey['taken']
m2m = m2m.merge(survey, left_on='ident', right_on='taken')

# Show the first 20 rows of m2m
print(m2m.head(20))


# Converting data types

In [None]:
# In this exercise, you'll see how ensuring all categorical variables
# in a DataFrame are of type category reduces memory usage.
# Convert the sex column to type 'category'
tips['sex'] = tips['sex'].astype('category')

# Convert the smoker column to type 'category'
tips['smoker'] = tips['smoker'].astype('category')

# Show the info of tips.
# DataFrame.info() writes the report itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None".
tips.info()


In [None]:
# If you expect the data type of a column to be numeric (int or float),
# but instead it is of type object, this typically means that there is
# a non numeric value in the column, which also signifies bad data.

# You can use the pd.to_numeric() function to convert a column into a
# numeric data type. If the function raises an error, you can be sure
# that there is a bad value within the column. You can either use the
# techniques you learned in Chapter 1 to do some exploratory data analysis
# and find the bad value, or you can choose to ignore or coerce the
# value into a missing value, NaN.

# Convert 'total_bill' to a numeric dtype; values that cannot be parsed
# become NaN because of errors='coerce'
tips['total_bill'] = pd.to_numeric(tips['total_bill'], errors='coerce')

# Convert 'tip' to a numeric dtype, coercing unparseable values to NaN
tips['tip'] = pd.to_numeric(tips['tip'], errors='coerce')

# Show the info of tips.
# DataFrame.info() writes the report itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None".
tips.info()


# Regular Expression

In [1]:
# The regular expression module in python is re. When performing pattern
# matching on data, since the pattern will be used for a match across
# multiple rows, it's better to compile the pattern first using re.compile(),
# and then use the compiled pattern to match values.

# Import the regular expression module
import re

# Compile the pattern: prog
# A raw string keeps the backslashes intact for the regex engine; '\d' in
# a normal string literal is an invalid escape sequence that newer Python
# versions warn about.
prog = re.compile(r'\d{3}-\d{3}-\d{4}')

# See if the pattern matches (match() anchors at the start of the string)
result = prog.match('123-456-7890')
print(bool(result))

# See if the pattern matches: the leading four-digit group must not match
result = prog.match('1123-456-7890')
print(bool(result))

True
False


In [2]:
# Extracting numbers from strings is a common task, particularly when
# working with unstructured data or log files.
# Say you have the following string: 'the recipe calls for 6 strawberries
# and 2 bananas'.
# It would be useful to extract the 6 and the 2 from this string to be saved
# for later use when comparing strawberry to banana ratios.
# When using a regular expression to extract multiple numbers (or multiple
# pattern matches, to be exact), you can use the re.findall() function. Dan
# did not discuss this in the video, but it is straightforward to use: You
# pass in a pattern and a string to re.findall(), and it will return
# a list of the matches.

# Import the regular expression module
import re

# Find the numeric values: matches
# The pattern is a raw string so that '\d' reaches the regex engine
# unchanged instead of being treated as an (invalid) string escape.
matches = re.findall(r'\d+', 'the recipes calls for 10 strawberries and 1 banana')

# Print the matches
print(matches)


['10', '1']


In [3]:
# All patterns are raw strings so that the backslash sequences (\d, \$,
# \., \w) reach the regex engine unchanged instead of being treated as
# invalid string escapes.

# Write the first pattern: a phone number of the form ddd-ddd-dddd
pattern1 = bool(re.match(pattern=r'\d{3}-\d{3}-\d{4}', string='123-456-7890'))
print(pattern1)

# Write the second pattern: a dollar sign, three digits, a dot, two digits
pattern2 = bool(re.match(pattern=r'\$\d{3}\.\d{2}', string='$123.45'))
print(pattern2)

# Write the third pattern: a capital letter followed by word characters
pattern3 = bool(re.match(pattern=r'[A-Z]\w*', string='Australia'))
print(pattern3)


True
True
True


In [None]:
# The tips dataset has been pre-loaded into a DataFrame called tips.
# It has a 'sex' column that contains the values 'Male' or 'Female'.
# Your job is to write a function that will recode 'Male' to 1, 'Female'
# to 0, and return np.nan for all entries of 'sex' that are neither 'Male' nor 'Female'.

# You can use the .apply() method to apply a function across entire rows or columns
# of DataFrames. However, note that each column of a DataFrame is a pandas Series.
# Functions can also be applied across Series. Here, you will apply your function over the 'sex' column.

# Define recode_sex()
def recode_sex(sex_value):
    """Recode a 'sex' label as a number.

    Args:
        sex_value: The value to recode, e.g. one entry of a 'sex' column.

    Returns:
        1 if sex_value equals 'Male', 0 if it equals 'Female', and NaN
        (a float) for anything else.
    """
    # Return 1 if sex_value is 'Male'
    if sex_value == 'Male':
        return 1

    # Return 0 if sex_value is 'Female'
    if sex_value == 'Female':
        return 0

    # Return NaN for every other value. float('nan') is the same float
    # value as np.nan but does not depend on numpy being imported, which
    # this file never does.
    return float('nan')

# Run recode_sex over every entry of the 'sex' column and store the
# result in a new 'sex_recode' column
tips['sex_recode'] = tips['sex'].apply(recode_sex)

# Show the first five rows of tips
print(tips.head(5))


In [None]:
# You'll now be introduced to a powerful Python feature that will help
# you clean your data more effectively: lambda functions. Instead of using
# the def syntax that you used in the previous exercise, lambda functions
# let you make simple, one-line functions.

# For example, here's a function that squares a variable used in an .apply() method:

# def my_square(x):
#     return x ** 2

# df.apply(my_square)
# The equivalent code using a lambda function is:

# df.apply(lambda x: x ** 2)
# The lambda function takes one parameter - the variable x. The function itself
# just squares x and returns the result, which is whatever the one line of code
# evaluates to. In this way, lambda functions can make your code concise and Pythonic.

# Write the lambda function using replace: strip the '$' from each value
tips['total_dollar_replace'] = tips.total_dollar.apply(lambda x: x.replace('$', ''))

# Write the lambda function using regular expressions: keep the first
# "digits.digits" run found in each value. The pattern is a raw string so
# that '\d' and '\.' reach the regex engine unchanged instead of being
# treated as invalid string escapes.
# NOTE: [0] raises IndexError for a value with no decimal number in it.
tips['total_dollar_re'] = tips.total_dollar.apply(lambda x: re.findall(r'\d+\.\d+', x)[0])

# Print the head of tips
print(tips.head())


In [None]:
# Create the new DataFrame: tracks
tracks = billboard[['year', 'artist', 'track', 'time']]

# Show the info of tracks.
# DataFrame.info() writes the report itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None".
tracks.info()

# Drop the duplicates: tracks_no_duplicates
tracks_no_duplicates = tracks.drop_duplicates()

# Show the info of tracks_no_duplicates
tracks_no_duplicates.info()


In [None]:
# Calculate the mean of the Ozone column: oz_mean
oz_mean = airquality['Ozone'].mean()

# Replace all the missing values in the Ozone column with the mean
airquality['Ozone'] = airquality['Ozone'].fillna(oz_mean)

# Show the info of airquality.
# DataFrame.info() writes the report itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None".
airquality.info()

In [None]:
# Use the .all() method together with the .notnull() DataFrame method to check
# for missing values in a column. The .all() method returns True if all values
# are True. When used on a DataFrame, it returns a Series of Booleans - one for
# each column in the DataFrame. So if you are using it on a DataFrame, like in
# this exercise, you need to chain another .all() method so that you return only
# one True or False value. When using these within an assert statement, nothing
# will be returned if the assert statement is true: This is how you can confirm
# that the data you are checking are valid.

# Note: You can use pd.notnull(df) as an alternative to df.notnull().


# The first .all() method will return a True or False for each column, while the second
# .all() method will return a single True or False.
# No cell of ebola may be missing: the first .all() reduces each column
# to one Boolean, the second reduces those to a single Boolean
assert ebola.notnull().all().all()

# Every cell of ebola must be greater than or equal to zero
assert ebola.ge(0).all().all()