# Pandas 1

## Name: Daniel Perkins

## Class: MATH 403

## Date: 7/28/24

In [3]:
import numpy as np
import pandas as pd

# Problem 1

In [4]:
# Prob 1
def prob1(file='budget.csv'):
    """"
    Read in budget.csv as a DataFrame with the index as column 0 and perform each of these operations on the DataFrame in order. 
    
    1) Reindex the columns such that amount spent on groceries is the first column and all other columns maintain the same ordering.
    2) Sort the DataFrame in descending order based on how much money was spent on Groceries.
    3) Reset all values in the 'Rent' column to 800.0.
    4) Reset all values in the first 5 data points to 0.0
    
    Return the values of the updated DataFrame as a NumPy array.
    
    Parameters:
        file (str): name of datafile
        
    Return:
        values (ndarray): values of DataFrame
    """
    data = pd.read_csv(file, index_col=0)  # Load in data
    data = data.reindex(columns=['Groceries', 'Rent', 'Utilities', 'Dining Out', 'Gas', 'Out With Friends', 'Netflix'])  # Move grocies to front
    data = data.sort_values('Groceries', ascending=False)  # Sort by money spent on groceries
    data["Rent"] = 800   # Set all rent values to 800
    data.iloc[[0, 1, 2, 3, 4], :] = 0  # Rest all values in first 5 entries to 0
    return data.values

In [5]:
print(prob1())

[[  0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.]
 [174. 800.  90.  37.  30.  23.   8.]
 [174. 800.  82.  35.  nan  26.  nan]
 [172. 800.  82.  31.  30.  26.   8.]
 [171. 800.  82.  40.  nan  23.  nan]
 [171. 800.  82.  35.  nan  27.  nan]
 [171. 800.  80.  30.  31.  22.  nan]
 [170. 800.  90.  34.  33.  nan   8.]
 [170. 800.  85.  34.  nan  25.  nan]
 [167. 800.  92.  30.  nan  29.  nan]
 [163. 800.  85.  30.  nan  nan  nan]
 [163. 800.  90.  31.  nan  25.  nan]
 [161. 800.  85.  30.  nan  24.  nan]
 [160. 800.  91.  32.  28.  23.  nan]
 [158. 800.  92.  nan  nan  22.  nan]
 [157. 800.  82.  nan  32.  21.   8.]
 [155. 800.  80.  nan  33.  26.   8.]
 [155. 800.  92.  33.  nan  nan  nan]
 [153. 800.  80.  31.  30.  27.   8.]
 [152. 800.  95.  30.  46.  nan   8.]
 [152. 800.  85.  39.  nan  29.  nan]
 [152. 800.  95.  32.  34.  22.   8.]
 [150. 800. 

# Problem 2

In [6]:
# Prob 2
def prob2(file='budget.csv'):
    """
    Read in file as DataFrame.
    Fill all NaN values with 0.0.
    Create two new columns, 'Living Expenses' and 'Other'. 
    Sum the columns 'Rent', 'Groceries', 'Gas' and 'Utilities' and set it as the value of 'Living Expenses'.
    Sum the columns 'Dining Out', 'Out With Friends' and 'Netflix' and set as the value of 'Other'.
    Identify which column, other than 'Living Expenses' correlates most with 'Living Expenses'
    and which column other than 'Other' correlates most with 'Other'.

    Return the names of each of those columns as a tuple.
    The first should be of the column corresponding to \li{'Living Expenses'} and the second to \li{'Other'}.
    
    Parameters:
        file (str): name of datafile
        
    Return:
        values (tuple): (name of column that most relates to Living Expenses, name of column that most relates to Other)
    """
    data = pd.read_csv(file, index_col=0)  # Load in data
    data = data.fillna(0.0)  # Fill NaN values
    data["Living Expenses"] = data["Rent"] + data["Groceries"] + data["Gas"] + data["Utilities"] # Create and set values for the new column
    data["Other"] = data["Dining Out"] + data["Out With Friends"] + data["Netflix"] # Create and set values for the new column
    corr_matrx = data.corr()  # Get correlation matrix
    largest_living_expense = corr_matrx["Living Expenses"].sort_values().index[-2]  # Take second largest value of Living Expenses correlation
    largest_other_expense = corr_matrx["Other"].sort_values().index[-2]  # Take second largest value of Other correlation
    return largest_living_expense, largest_other_expense

In [7]:
print(prob2())

('Rent', 'Dining Out')


# Problem 3

In [8]:
def prob3(file='crime_data.csv'):
    """
    Read in crime data and use pandas to answer the following questions.
    
    Set the index as the column 'Year', and return the answers to each question as a tuple.
    
    1) Identify the three crimes that have a mean over 1,500,000. 
    Of these three crimes, which two are very correlated? 
    Which of these two crimes has a greater maximum value?
    Save the title of this column as a variable to return as the answer.
    
    2) Examine the data since 2000.
    Sort this data (in ascending order) according to number of murders.
    Find the years where Aggravated Assault is greater than 850,000.
    Save the indices (the years) of the masked and reordered DataFrame as a NumPy array to return as the answer.
    
    3) What year had the highest crime rate? 
    In this year, which crime was committed the most? 
    What percentage of the total crime that year was it? 
    Save this value as a float.
    
    
    Parameters:
        file (str): data
    
    Return:
        ans_1 (string): answer to Question 1
        ans_2 (ndarray): answer to Question 2
        ans_3 (float): answer to Question 3
    """
    data = pd.read_csv(file, index_col='Year')  # Load in data

    # Find the most common crimes
    mean_crimes = data.mean().sort_values()  # Get mean numbers for each crime
    most_common_crimes = [data.mean().sort_values().index[i] for i in range(-5, -2)]  # Find three largest primes
    # Find correlaation between these crimes
    correlation = data[most_common_crimes].corr()
    values = correlation.values   # Convert to numpy array
    ignore_useless_correlations = values >= 1  # Ignore correlation between the same elements
    values[ignore_useless_correlations] = 0
    # Find which two of these crimes correlate the most
    largest_index = np.argmax(values) # Find index for values of largest correlation
    largest_index = (largest_index // 3, largest_index % 3)
    correlated_common_crimes = [most_common_crimes[i] for i in largest_index]
    # Get the maximum number of each of these crimes and output the crime that is the largest
    max_crimes = [max(data[crime]) for crime in correlated_common_crimes]
    ans_1 = correlated_common_crimes[np.argmax(max_crimes)]
    # print(ans_1)
    
    # Examine data since the 2000s
    mask = data.index >= 2000
    data2000s = data[mask]
    data2000s = data2000s.sort_values("Murder")  # Sort by number of murders
    mask = data["Aggravated Assault"] > 850000  # Used to filter out years with smaller numbers
    data2000s = data2000s[mask]
    ans_2 = data2000s.index.to_numpy()  # Find the years that satisfy the above
    # print(ans_2)
    
    # Add crime_rate as a column
    data["Crime Rate"] = data["Total"] / data["Population"]    
    max_crime_rate_row = data[data['Crime Rate'] == data['Crime Rate'].max()]  # Find row with the max crime rate
    max_crime_rate_row = max_crime_rate_row.squeeze()   # Convert to a series
    most_common_crime = max_crime_rate_row.sort_values().index[-3]  # Find the most common crime
    ans_3 = max_crime_rate_row[most_common_crime] / max_crime_rate_row["Total"]  # Find this crime was of all crimes
    # print(ans_3)
    
    return ans_1, ans_2, ans_3

In [9]:
print(prob3())

('Property', array([2000, 2001, 2002, 2003, 2005, 2007, 2006]), 0.8997188308734142)


  data2000s = data2000s[mask]


# Problem 4

In [17]:
def prob4(file='DJIA.csv'):
    """

    Read the data with a DatetimeIndex as the index.
    Drop rows any rows without numerical values, cast the "VALUE" column to floats, then return the updated DataFrame.

    Parameters:
        file (str): data file
    Returns:
        df (DataFrame): updated DataFrame of stock market data
    """
    data = pd.read_csv(file, index_col=0)  # Load in data
    data.index = pd.to_datetime(data.index)  # Convert to datetime
    data = pd.to_numeric(data['VALUE'], errors='coerce') # Backup to remove non-numeric values
    data.dropna(inplace=True)
    return data

In [18]:
print(prob4())

DATE
2006-09-27    11689.24
2006-09-28    11718.45
2006-09-29    11679.07
2006-10-02    11670.35
2006-10-03    11727.34
                ...   
2016-09-20    18129.96
2016-09-21    18293.70
2016-09-22    18392.46
2016-09-23    18261.45
2016-09-26    18094.83
Name: VALUE, Length: 2517, dtype: float64


# Problem 5

In [12]:
def prob5(file='paychecks.csv'):
    """

    Create data_range for index of paycheck data.

    Parameters:
        file (str): data file
    Returns:
        df (DataFrame): DataFrame of paycheck data
    """ 
    data = pd.read_csv(file, header=None, names=["paycheck"])  # Load in data
    dates = pd.date_range(start='3/13/2008', periods=93, freq="2W-FRI") # Date range for every other Friday
    data.index = dates
    return data

In [13]:
print(prob5())

            paycheck
2008-03-14   1122.26
2008-03-28    921.03
2008-04-11    962.46
2008-04-25   1035.97
2008-05-09   1078.59
...              ...
2011-07-29   1095.53
2011-08-12   1018.39
2011-08-26   1027.08
2011-09-09   1005.90
2011-09-23    963.29

[93 rows x 1 columns]


# Problem 6

In [24]:
def prob6(file='DJIA.csv'):
    """
    Compute the following information about the DJIA dataset
    1. The single day with the largest gain
    2. The single day with the largest loss

    Parameters:
        file (str): data file
    Returns:
        max_day (<M8[ns]): DateTimeIndex of maximum change
        min_day (<M8[ns]): DateTimeIndex of minimum change
    """
    data = prob4(file)
    gains = data - data.shift(1) # Shift to find gains
    max_day = gains.idxmax()
    min_day = gains.idxmin()
    # max_day = data.index[np.argmax(gains[1:])]  # Find date of largest gain
    # min_day = data.index[np.argmin(gains[1:])]  # Find date of largest gain
    return max_day, min_day

In [25]:
print(prob6())

(Timestamp('2008-10-13 00:00:00'), Timestamp('2008-09-29 00:00:00'))
