# Pandas 1

## Matthew Mella

## 403 001

## 09/05/23

In [1]:
import numpy as np
import pandas as pd

# Problem 1

In [42]:
# Prob 1
def prob1(file='budget.csv'):
    """
    Read in budget.csv as a DataFrame with the index as column 0 and perform each of these operations on the DataFrame in order.

    1) Reindex the columns such that amount spent on groceries is the first column and all other columns maintain the same ordering.
    2) Sort the DataFrame in descending order based on how much money was spent on Groceries.
    3) Reset all values in the 'Rent' column to 800.0.
    4) Reset all values in the first 5 data points to 0.0

    Return the values of the updated DataFrame as a NumPy array.

    Parameters:
        file (str): name of datafile (anything accepted by pd.read_csv)

    Return:
        values (ndarray): values of DataFrame
    """
    # read in the data, using the first column of the file as the index
    budget_df = pd.read_csv(file, index_col=0)

    # move 'Groceries' to the front; the remaining columns are taken from the
    # file itself so their relative order is preserved whatever the file holds
    remaining_cols = [col for col in budget_df.columns if col != 'Groceries']
    budget_df = budget_df.reindex(columns=['Groceries'] + remaining_cols)

    # largest grocery bill first
    budget_df = budget_df.sort_values('Groceries', ascending=False)

    # overwrite the whole 'Rent' column
    budget_df['Rent'] = 800.0

    # zero out the first five rows; a positional slice simply stops early
    # when the file has fewer than five rows instead of raising
    budget_df.iloc[:5] = 0.0

    return budget_df.to_numpy()

prob1()


array([[  0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [174., 800.,  90.,  37.,  30.,  23.,   8.],
       [174., 800.,  82.,  35.,  nan,  26.,  nan],
       [172., 800.,  82.,  31.,  30.,  26.,   8.],
       [171., 800.,  82.,  40.,  nan,  23.,  nan],
       [171., 800.,  82.,  35.,  nan,  27.,  nan],
       [171., 800.,  80.,  30.,  31.,  22.,  nan],
       [170., 800.,  90.,  34.,  33.,  nan,   8.],
       [170., 800.,  85.,  34.,  nan,  25.,  nan],
       [167., 800.,  92.,  30.,  nan,  29.,  nan],
       [163., 800.,  85.,  30.,  nan,  nan,  nan],
       [163., 800.,  90.,  31.,  nan,  25.,  nan],
       [161., 800.,  85.,  30.,  nan,  24.,  nan],
       [160., 800.,  91.,  32.,  28.,  23.,  nan],
       [158., 800.,  92.,  nan,  nan,  22.,  nan],
       [157., 800.,  82.,  nan,

# Problem 2

In [41]:
# Prob 2
def prob2(file='budget.csv'):
    """
    Read in file as DataFrame.
    Fill all NaN values with 0.0.
    Create two new columns, 'Living Expenses' and 'Other'.
    Sum the columns 'Rent', 'Groceries', 'Gas' and 'Utilities' and set it as the value of 'Living Expenses'.
    Sum the columns 'Dining Out', 'Out With Friends' and 'Netflix' and set as the value of 'Other'.
    Identify which column, other than 'Living Expenses' correlates most with 'Living Expenses'
    and which column other than 'Other' correlates most with 'Other'.

    Return the names of each of those columns as a tuple.
    The first should be of the column corresponding to 'Living Expenses' and the second to 'Other'.

    Parameters:
        file (str): name of datafile (anything accepted by pd.read_csv)

    Return:
        values (tuple): (name of column that most relates to Living Expenses, name of column that most relates to Other)
    """
    # read in the data and replace missing entries with 0.0
    budget_df = pd.read_csv(file, index_col=0).fillna(0.0)

    # create the two aggregate columns (row-wise sums)
    budget_df['Living Expenses'] = budget_df[['Rent', 'Groceries', 'Gas', 'Utilities']].sum(axis=1)
    budget_df['Other'] = budget_df[['Dining Out', 'Out With Friends', 'Netflix']].sum(axis=1)

    # pairwise correlation between every pair of columns
    corr_df = budget_df.corr()

    # Drop each aggregate's correlation with itself by label rather than by
    # position: a second column that is perfectly correlated would tie at 1.0,
    # and "skip the first sorted entry" could then discard the wrong one.
    # idxmax skips NaN entries (produced by constant columns).
    living_match = corr_df['Living Expenses'].drop('Living Expenses').idxmax()
    other_match = corr_df['Other'].drop('Other').idxmax()

    return living_match, other_match

prob2()


('Rent', 'Dining Out')

# Problem 3

In [40]:
def prob3(file='crime_data.csv'):
    """
    Read in crime data and use pandas to answer the following questions.

    Set the index as the column 'Year', and return the answers to each question as a tuple.

    1) Identify the three crimes that have a mean over 1,500,000.
    Of these three crimes, which two are very correlated?
    Which of these two crimes has a greater maximum value?
    Save the title of this column as a variable to return as the answer.

    2) Examine the data since 2000.
    Sort this data (in ascending order) according to number of murders.
    Find the years where Aggravated Assault is greater than 850,000.
    Save the indices (the years) of the masked and reordered DataFrame as a NumPy array to return as the answer.

    3) What year had the highest crime rate?
    In this year, which crime was committed the most?
    What percentage of the total crime that year was it?
    Save this value as a float.


    Parameters:
        file (str): data (anything accepted by pd.read_csv)

    Return:
        ans_1 (string): answer to Question 1
        ans_2 (ndarray): answer to Question 2
        ans_3 (float): answer to Question 3
    """
    # 'Population' and 'Total' are used below as the denominator/numerator of
    # the crime rate, so they are not candidate crimes themselves. They are
    # excluded by name so the result does not depend on column order.
    non_crime_cols = ['Population', 'Total']

    # load the DataFrame with the year as the index
    crime_df = pd.read_csv(file)
    crime_df = crime_df.set_index('Year')

    # Part 1

    # crimes whose mean over all years exceeds 1,500,000
    crime_means = crime_df.drop(columns=non_crime_cols, errors='ignore').mean()
    frequent_crimes = crime_means[crime_means > 1500000].index

    # correlation between those crimes
    corr_df = crime_df[frequent_crimes].corr()

    # blank out the diagonal (each crime with itself) explicitly instead of
    # filtering on "< 1", which relies on exact floating-point equality
    corr_values = corr_df.to_numpy(copy=True)
    np.fill_diagonal(corr_values, np.nan)

    # locate the largest remaining correlation and recover the two labels
    row, col = np.unravel_index(np.nanargmax(corr_values), corr_values.shape)
    correlated_pair = [corr_df.index[row], corr_df.columns[col]]

    # of the two correlated crimes, keep the one with the larger maximum
    ans_1 = crime_df[correlated_pair].max().idxmax()

    # Part 2

    # restrict to the year 2000 onward, then order by ascending murder count
    since_2000 = crime_df.loc[2000:].sort_values('Murder', ascending=True)

    # keep the years (index labels) where aggravated assault exceeds 850,000
    assault_mask = since_2000['Aggravated Assault'] > 850000
    ans_2 = since_2000[assault_mask].index.to_numpy()

    # Part 3

    # crime rate as total crimes per person; pick the year where it peaks
    peak_year = (crime_df['Total'] / crime_df['Population']).idxmax()
    peak_year_counts = crime_df.loc[peak_year]

    # most frequent crime column in that year
    top_crime = peak_year_counts.drop(non_crime_cols).idxmax()

    # its share of that year's total, as a plain float
    ans_3 = float(peak_year_counts[top_crime] / peak_year_counts['Total'])

    return ans_1, ans_2, ans_3


prob3()


('Property',
 array([2000, 2001, 2002, 2003, 2005, 2007, 2006]),
 0.8997188308734142)

# Problem 4

In [37]:
def prob4(file='DJIA.csv'):
    """
    Read the data with a DatetimeIndex as the index.
    Drop any rows without numerical values, cast the "VALUE" column to floats, then return the updated DataFrame.

    Parameters:
        file (str): data file (anything accepted by pd.read_csv)
    Returns:
        df (DataFrame): updated DataFrame of stock market data
    """
    # read the requested file (previously the path was hard-coded and the
    # `file` argument was silently ignored)
    raw = pd.read_csv(file)

    # build the date index from the "DATE" column
    date_index = pd.DatetimeIndex(pd.to_datetime(raw["DATE"]), name="DATE")

    # convert "VALUE" to floats; entries that are not numbers become NaN
    value_col = pd.to_numeric(raw["VALUE"], errors='coerce').astype(float)

    # assemble the frame from the converted values and the date index;
    # to_numpy() avoids aligning on the old integer index
    djia = pd.DataFrame({"VALUE": value_col.to_numpy()}, index=date_index)

    # discard the rows whose value could not be parsed
    return djia.dropna()

prob4()

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
2006-09-27,11689.24
2006-09-28,11718.45
2006-09-29,11679.07
2006-10-02,11670.35
2006-10-03,11727.34
...,...
2016-09-20,18129.96
2016-09-21,18293.70
2016-09-22,18392.46
2016-09-23,18261.45


# Problem 5

In [38]:
def prob5(file='paychecks.csv'):
    """
    Create date_range for index of paycheck data.

    The file is read without a header row; its first column is renamed to
    'VALUE' and the rows are labelled with every second Friday starting
    on 2008-03-14.

    Parameters:
        file (str): data file (anything accepted by pd.read_csv)
    Returns:
        df (DataFrame): DataFrame of paycheck data
    """
    # load the requested file (previously the path was hard-coded and the
    # `file` argument was silently ignored)
    df = pd.read_csv(file, header=None)

    # one date per paycheck: size the index from the data instead of a fixed
    # count so it always matches the number of rows that were read
    time_index = pd.date_range(start="2008-03-14", periods=len(df), freq='2W-FRI')

    # set the index and name the index and the data column
    df = df.set_index(time_index)
    df.index.name = "DATE"
    df = df.rename(columns={0: 'VALUE'})

    return df

prob5()

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
2008-03-14,1122.26
2008-03-28,921.03
2008-04-11,962.46
2008-04-25,1035.97
2008-05-09,1078.59
...,...
2011-07-29,1095.53
2011-08-12,1018.39
2011-08-26,1027.08
2011-09-09,1005.90


# Problem 6

In [39]:
def prob6(file='DJIA.csv'):
    """
    Compute the following information about the DJIA dataset
    1. The single day with the largest gain
    2. The single day with the largest loss

    Parameters:
        file (str): data file
    Returns:
        max_day (<M8[ns]): DateTimeIndex of maximum change
        min_day (<M8[ns]): DateTimeIndex of minimum change
    """
    # load the cleaned DataFrame from problem 4, forwarding the requested
    # file (previously the `file` argument was never passed along)
    df = prob4(file)

    # change relative to the previous row; the first entry is NaN
    daily_change = df["VALUE"].diff()

    # idxmax/idxmin return the index labels (dates) directly and skip NaN
    return daily_change.idxmax(), daily_change.idxmin()

prob6()

(Timestamp('2008-10-13 00:00:00'), Timestamp('2008-09-29 00:00:00'))