# <span style="color:darkblue"> Lecture 12: Application 2 - Random Assignment </span>

<font size = "5">



# <span style="color:darkblue"> I. Import Libraries and Data </span>


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the car features dataset into a pandas DataFrame
# Note: The path is relative, so it is resolved from the current working folder
carfeatures = pd.read_csv("data_raw/features.csv")

# <span style="color:darkblue"> II. Random Assignment </span>

<font size = "5">

Random assignment is crucial for scientific progress ...

- The basis for medical trials
- Also used in engineering, the natural sciences and <br>
  social sciences (economics, political science, etc.)


In [3]:
# Ingredients for the random assignment:
# - "list_status" holds the names of the two arms (treatment/control)
# - "prop_status" holds the proportion of each arm, listed in the same order
# - "size_dataset" is the number of rows in "carfeatures"

size_dataset = carfeatures.shape[0]
list_status  = ["Treatment","Control"]
prop_status  = [0.4,0.6]

<font size = "5">
Random assignment


In [4]:
# "np.random.choice" draws one status per row, and we store the
# resulting vector as a new column in "carfeatures"
# Note: (i) We can always split the arguments of a function in multiple lines
#           to make it easier to read
#       (ii) We fix the random seed first, so that running the notebook
#            again produces exactly the same assignment

np.random.seed(123)

carfeatures["status"] = np.random.choice(list_status,
                                         size = size_dataset,
                                         p = prop_status)

display(carfeatures)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,vehicle_id,status
0,18.0,8,307,130,3504,12.0,C-1689780,Treatment
1,15.0,8,350,165,3693,11.5,B-1689791,Control
2,18.0,8,318,150,3436,11.0,P-1689802,Control
3,16.0,8,304,150,3433,12.0,A-1689813,Treatment
4,17.0,8,302,140,3449,10.5,F-1689824,Control
...,...,...,...,...,...,...,...,...
393,27.0,4,140,86,2790,15.6,F-1694103,Control
394,44.0,4,97,52,2130,24.6,V-1694114,Control
395,32.0,4,135,84,2295,11.6,D-1694125,Control
396,28.0,4,120,79,2625,18.6,F-1694136,Control


<font size = "5">

Compute frequencies by status

In [5]:
# "pd.crosstab" counts how many rows fall in each status
# Adding the option "normalize = True" turns the counts into proportions
# Note: "np.random.choice" draws the status of each row independently
#       (by default, with replacement), so the realized proportions are
#       close to, but not exactly equal to, "prop_status"

proportions_table = pd.crosstab(index = carfeatures["status"],
                                columns = "Frequency",
                                normalize = True)
frequency_table   = pd.crosstab(index = carfeatures["status"],
                                columns = "Frequency")

display(frequency_table, proportions_table)


col_0,Frequency
status,Unnamed: 1_level_1
Control,227
Treatment,171


col_0,Frequency
status,Unnamed: 1_level_1
Control,0.570352
Treatment,0.429648


<font size = "5">

Query with string conditions

In [6]:
# When a query compares a text variable, the condition is a string that
# contains another string, so the outer and inner quotations must be of
# different types. Here we use outer ' ' single quotations
# and inner double quotations.

data_treated = carfeatures.query('status == "Treatment" ')
data_control = carfeatures.query('status == "Control" ')

<font size = "5">

Treated/control should be similar

- This is the key principle of random assignment
- We can check the summary statistics

In [7]:
# The "count" row differs because the two arms were given different proportions
# All other summary statistics are approximately the same in both arms
# They are not identical because the assignment is random

display(data_treated.describe(), data_control.describe())

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration
count,171.0,171.0,171.0,171.0,171.0
mean,22.656725,5.608187,205.994152,3034.315789,15.523392
std,7.686864,1.681646,108.168795,854.392144,2.693703
min,11.0,3.0,70.0,1613.0,8.5
25%,16.0,4.0,106.0,2238.0,13.65
50%,21.0,6.0,200.0,2945.0,15.5
75%,28.0,8.0,303.0,3727.5,17.25
max,44.3,8.0,455.0,5140.0,23.7


Unnamed: 0,mpg,cylinders,displacement,weight,acceleration
count,227.0,227.0,227.0,227.0,227.0
mean,24.160793,5.339207,183.960352,2922.295154,15.601762
std,7.866981,1.710045,100.435658,839.783725,2.810378
min,9.0,3.0,68.0,1649.0,8.0
25%,18.0,4.0,99.0,2217.5,14.0
50%,24.0,4.0,140.0,2720.0,15.5
75%,29.95,7.0,260.0,3442.0,17.0
max,46.6,8.0,455.0,4997.0,24.8


## <span style="color:darkblue"> III. Quiz Structure </span>

<font size = "5">

On the day of the quiz I will ...
- Provide a dataset with information
- Give more specific instructions.
- Below, you will see the type of questions that will be asked.
- The idea is for you to apply known concepts to new data
- You have 50 minutes to complete the assignment

Questions

(the exact wording may change in the quiz, but the exercises will be very similar)


<font size = "5">

(a) Create a function and apply it to a column

- Check Lecture 8 for how to define a function
- The function will have if/else statements and output a string
- You will use ".apply()" to create a new variable in the dataset <br>
(see Lecture 9)

In [8]:
# Define a function that takes a numeric input value and outputs a string
def convert_to_grade(score):
    """Return the letter grade ("A", "B", "C", "D" or "F") for a numeric score.

    A score earns the first letter whose minimum it reaches:
    90 for "A", 80 for "B", 70 for "C" and 60 for "D".
    Any score that reaches none of these minimums is an "F".
    """
    # Cutoffs are listed from highest to lowest, so the first match wins
    minimum_by_letter = (("A", 90), ("B", 80), ("C", 70), ("D", 60))

    for letter, minimum in minimum_by_letter:
        if score >= minimum:
            return letter

    return "F"

# Small example dataset with a single numeric column
df = pd.DataFrame({'scores': [85, 92, 73, 60, 78]})

# Run the function on every score and store the letters in a new column
df = df.assign(grades = df['scores'].apply(convert_to_grade))

# Print the resulting dataframe
print(df)

   scores grades
0      85      B
1      92      A
2      73      C
3      60      D
4      78      C


<font size = "5">

(b) Use queries + global variables

- You will be asked to compute certain summary statistics <br>
(mean, median, etc.)
- The query will have multiple conditions
- You will then subset the dataset to the rows that meet those conditions
- See Lecture 10 for more details

In [None]:
# Load the dataset
df = pd.read_csv("example_data.csv")

# Compute summary statistics
# Note: "np.std" divides by n, whereas the pandas method ".std()"
#       divides by n - 1 by default
mean = np.mean(df['score'])
median = np.median(df['score'])
std_dev = np.std(df['score'])

# Store the cutoff in a global variable
# Note: Inside a query, "@" tells pandas to read a variable defined outside
min_score = 70

# Subset the dataset with a query that has multiple conditions
subset_df = df.query("(score >= @min_score) & (gender == 'female')")


<font size = "5">

(c) Use sorting + ".loc[]"

- Extract the observations with the largest values of a column
- See Lecture 10 for details

In [None]:
# Load the dataset
df = pd.read_csv("example_data.csv")

# Order the rows from the largest to the smallest value of "score"
sorted_df = df.sort_values('score', ascending = False)

# "sorted_df.index[:5]" gives the first five row labels of the sorted data
# and ".loc[]" selects the rows with those labels (keeping all columns)
top_obs = sorted_df.loc[sorted_df.index[:5]]


<font size = "5">

(d) Split a dataset into subsets

- You will be asked to randomly assign a status to each row
- Split the data into separate datasets using ".query()"
- This will closely follow the material in Lecture 12 (this one)
- You will need this result to answer questions (e) and (f)


In [None]:
# Load the dataset
df = pd.read_csv("example_data.csv")

# Fix the random seed so that the assignment below is the same in every run
np.random.seed(123)

# Draw a status for each row: "train" with probability 0.8, "test" with 0.2
df['status'] = np.random.choice(['train', 'test'],
                                size = len(df),
                                p = [0.8, 0.2])

# Keep the rows of each status in their own dataset
test_df = df.query("status == 'test'")
train_df = df.query("status == 'train'")


<font size = "5">

(e) Create a function with four inputs $f(y,x,b0,b1)$

- Start by using "def" to define the function
- The function will include arithmetic operations (Lecture 3) <br>
and summary statistics for pandas (mean, std, min, max, etc.)
- You will be asked to test different values of $(y,x,b0,b1)$
- You will get $y$ and $x$ from the two datasets in part (d)
- Note: You will **not** be required to use the "statsmodels" library


In [None]:
# Define the function
def f(y, x, b0, b1):
    """Compare y with the line b0 + b1 * x and summarize the result.

    Parameters:
        y:  observed values of the outcome
        x:  observed values of the explanatory variable
        b0: intercept of the line
        b1: slope of the line

    Returns:
        A dictionary with the predicted values ('y_pred'), the residuals
        ('residuals'), and the mean, standard deviation, minimum and maximum
        of the residuals, of x, and of y (e.g. 'mean_residuals', 'std_x',
        'max_y').
    """
    fitted = b0 + b1 * x
    errors = y - fitted

    results = {'y_pred': fitted,
               'residuals': errors}

    # The same four statistics are stored for the residuals, then x, then y
    for label, values in (('residuals', errors), ('x', x), ('y', y)):
        results['mean_' + label] = np.mean(values)
        results['std_' + label] = np.std(values)
        results['min_' + label] = np.min(values)
        results['max_' + label] = np.max(values)

    return results

# Try the function on two example datasets that have columns "y" and "x",
# using a different pair of values (b0, b1) for each one
df1 = pd.read_csv("example_data1.csv")
df2 = pd.read_csv("example_data2.csv")

result1 = f(y = df1['y'], x = df1['x'], b0 = 2, b1 = 3)
result2 = f(y = df2['y'], x = df2['x'], b0 = 1, b1 = 4)

# Print one dictionary of results per line
print(result1, result2, sep = "\n")


<font size = "5">

(f) Create two overlapping histogram plots

- You will use a variable from the two datasets in (d)
- You need to use the "alpha" option to make the graphs semitransparent
- You will need to add a legend, axis labels, and a title
- Note: The goal of this question is to illustrate that random <br>
assignment produces very similar distributions between two groups

In [None]:
# Load the data for the two groups
df1 = pd.read_csv("example_data1.csv")
df2 = pd.read_csv("example_data2.csv")

# Draw both histograms on the same axes
# Note: "alpha = 0.5" makes the bars semitransparent, so that the region
#       where the two groups overlap remains visible
fig, ax = plt.subplots()
ax.hist(df1['variable'], bins = 20, alpha = 0.5, label = 'Group 1')
ax.hist(df2['variable'], bins = 20, alpha = 0.5, label = 'Group 2')

# Add the axis labels, the title, and the legend
ax.set(xlabel = 'Variable',
       ylabel = 'Count',
       title = 'Histogram of Variable by Group')
ax.legend()

# Show the plot
plt.show()
