In [None]:
# Initialize Otter
import otter
grader = otter.Notebook("hwk02.ipynb")

In [None]:
# Import all the modules we need
from IPython.core.display import HTML
from datascience import *

import os
import matplotlib
matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('fivethirtyeight')

import warnings
warnings.simplefilter('ignore', FutureWarning)

## Homework 2: Review of survey results
### Load the survey responses

In [None]:
os.getcwd() # Show the current working directory; it is worth checking before loading a file by a relative path.
            # We are in the hwk2 folder, so we can load the data file by its name alone.

In [None]:
survey = Table.read_table('ucb_fa2024_personal_networks_clean.csv')

How many responses are there?

In [None]:
num_rows = survey.num_rows
num_rows

### Who responded to the survey?

Look at the age distribution of respondents:

In [None]:
survey.select('respondent_age').hist()

Look at the gender distribution:

In [None]:
survey.group('respondent_gender').barh('respondent_gender')

Look at the class distribution:

In [None]:
survey.group('respondent_class').barh('respondent_class')

### Relationship between respondent and first alter named: gender

In [None]:
# normalize='index' rescales every row to sum to 1, so each row shows, for one
# respondent gender, the share of first-named alters of each gender.
pd.crosstab(survey['respondent_gender'], survey['alter1_gender'], normalize='index')

In [None]:
pd.crosstab(survey['respondent_gender'], survey['alter1_gender']) # raw counts (no normalization)

In [None]:
# Observed fraction of cross-gender dyads, computed from counts typed in by hand.
# NOTE(review): these numbers appear to be read off the raw-count crosstab above
# (the two cross-gender cells over the four Male/Female cells) -- confirm that they
# match the current data file whenever the data changes.
# NOTE(review): frac_mf_dyads divides by the total number of rows instead; the two
# denominators agree only if every dyad has both genders recorded as Male or Female.
obs_frac_nonhom = (23 + 43) / (93 + 23 + 43 + 48)
obs_frac_nonhom

In [None]:
# UC Berkeley undergraduate enrollment by gender (Fall 2022). Source:
# https://opa.berkeley.edu/campus-data/uc-berkeley-quick-facts
male_undergrads = 14183
female_undergrads = 17808

prop_male = male_undergrads / (male_undergrads + female_undergrads)
prop_female = 1 - prop_male

# If both members of a dyad had their gender drawn independently with these
# proportions, the chance of a male-female or female-male pair is p*q + q*p = 2pq.
rand_expected_mf = 2 * prop_male * prop_female
rand_expected_mf

In [None]:
# Shuffle the alters' genders across respondents to break any link between the
# respondent's gender and the alter's gender.
# Table.sample draws WITH replacement by default, which would give a bootstrap
# resample rather than a permutation, so we ask for sampling without replacement.
permuted_alter_gender = survey.select('alter1_gender').sample(num_rows, with_replacement=False) # NB: num_rows is the number of rows in our dataset
permuted_dyads = Table().with_columns(
    'respondent_gender', survey.column('respondent_gender'),
    'alter1_gender', permuted_alter_gender.column(0))
permuted_dyads

Let's write a function to help calculate the fraction of dyads that goes from male to female or from female to male.

In [None]:
def frac_mf_dyads(permuted_df):
    """
    Calculate the fraction of dyads that is male to female OR female to male

    Parameters
    ----------
    permuted_df : Table
        Table of dyads with one row per dyad and the columns
        'respondent_gender' and 'alter1_gender'.

    Returns
    -------
    float
        Number of rows that are ('Male', 'Female') or ('Female', 'Male'),
        divided by the total number of rows in `permuted_df`. Rows with any
        other value count toward the denominator only.
    """
    # Read the columns from the table that was passed in, not from a notebook-level
    # variable, so the function gives the right answer for whatever table it is given.
    respondent_gender = np.asarray(permuted_df.column('respondent_gender'))
    alter_gender = np.asarray(permuted_df.column('alter1_gender'))

    # Counting with boolean masks returns 0 when a combination never occurs,
    # instead of failing because the grouped table has no matching row.
    mf = np.count_nonzero((respondent_gender == 'Male') & (alter_gender == 'Female'))
    fm = np.count_nonzero((respondent_gender == 'Female') & (alter_gender == 'Male'))

    return int(mf + fm) / permuted_df.num_rows

#permuted_frac_mf = permuted_dyads.where()
frac_mf_dyads(permuted_dyads)

## Homework 2 (the part you will turn in): Homophily in personal networks

Homophily is a sociological concept that describes the phenomenon of 'like attracts like'. For example, Berkeley students might hang out more with those who are the same gender as themselves, come from the same states/cities, take the same classes, etc. We will discuss homophily in detail in future lectures, but let's explore evidence of homophily in the data that we collected.

First, we need to manipulate the data to facilitate our analysis.

Let's take many resamples and calculate the fraction of cross-gender edges for each one

In [None]:
# Build the null distribution: shuffle the alters' genders many times and record
# the fraction of cross-gender dyads in each shuffled dataset.
num_simulations = 10000
simulated_fracs = []  # a plain list: appending is cheap, whereas np.append copies the whole array every time

for _ in np.arange(num_simulations):
    # Table.sample draws WITH replacement by default; with_replacement=False makes
    # this a true shuffle (permutation) of the observed alter genders.
    permuted_alter_gender = survey.select('alter1_gender').sample(num_rows, with_replacement=False) # NB: num_rows is the number of rows in our dataset
    permuted_dyads = Table().with_columns(
        'respondent_gender', survey.column('respondent_gender'),
        'alter1_gender', permuted_alter_gender.column(0))
    simulated_fracs.append(frac_mf_dyads(permuted_dyads))

# Convert once at the end; the result is the same float array the plotting cell expects.
nonhom_fracs = np.array(simulated_fracs, dtype=float)
null_fracs = Table().with_column('frac_dyads_nonhom', nonhom_fracs)

Let's add a plot showing where our observed value is, so that we can easily compare the observed value to the null distribution.

In [None]:
null_fracs.hist('frac_dyads_nonhom')
# Draw a red vertical line at the observed value so it can be compared with the null distribution.
plt.axvline(x=obs_frac_nonhom,c='red',linewidth=2);

We will convert the table from wide format to long format, like we did in Lab 1, using slightly modified versions of `repeat_single_col` and `wide_to_long`.

In [None]:
def repeat_single_col(data, var_name, times=5):
    """Stack several back-to-back copies of one column of a table.

    Parameters
    ----------
    data : Table
        Table that holds the column to repeat.
    var_name : str
        Label of the column to repeat.
    times : int
        How many copies of the column to stack (default 5).

    Returns
    -------
    np.array
        One array holding the whole column `times` times in a row, i.e. the
        column's values in order, then the same values again, and so on.

    Examples
    --------
    >>> repeat_single_col(Table().with_columns('respondent_age', make_array(10, 12)),
                          'respondent_age', times=2)

    array([10, 12, 10, 12])
    """
    column_values = data.column(var_name)
    # np.tile repeats the full sequence (a, b, a, b), not each element (a, a, b, b),
    # which keeps the rows aligned with the stacked alter columns.
    return np.tile(column_values, times)

def wide_to_long(data, var_name, times=5):
    """Stack the per-alter columns of one characteristic into a single long column.

    Parameters
    ----------
    data : Table
        Table that holds one column per alter for the characteristic.
    var_name : str
        Name of the characteristic. The table must have columns labelled
        alter[NUM]_[var_name] for NUM = 1, ..., times; for example, with
        var_name 'age' and times 5 the columns
        'alter1_age', 'alter2_age', 'alter3_age', 'alter4_age', and 'alter5_age'
        are read.
    times : int
        Number of alter columns to stack (default 5).

    Returns
    -------
    np.array
        All of alter 1's values, followed by all of alter 2's values, and so on.

    Examples
    --------
    >>> wide_to_long(Table().with_columns('alter1_age', make_array(10, 15),
                                          'alter2_age', make_array(30, 35)),
                     'age', times=2)

    array([10, 15, 30, 35])
    """
    stacked_pieces = []
    for alter_num in range(1, times + 1):
        column_label = ''.join(['alter', str(alter_num), '_', var_name])
        stacked_pieces.append(data.column(column_label))
    return np.concatenate(stacked_pieces)

Also, create an id variable for our survey respondents

In [None]:
## create a respondent id variable (1, 2, ..., number of respondents)
survey['respondent_id'] = range(1, survey.num_rows + 1) # NB: [] assignment adds the column to `survey` itself (in place); .with_column would instead return a new table

In [None]:
survey.show(6)

## Question 1: Full Alters Table

Create a long dataset that has

* respondent id
* respondent age
* respondent class
* respondent home
* alter age
* alter gender
* alter class
* alter home

Don't forget to perform a couple of checks to be sure the resulting dataset makes sense (like we did in Lab 1).

*Hint: you need to **repeat** the information for the respondent and **convert from wide to long** for the alters' information.*

In [None]:

alter_data = Table().with_columns([
    'respondent_id', repeat_single_col(survey, 'respondent_id'),
    'respondent_age', ...,
    'respondent_class', ...,
    'respondent_home', ...,
    'alter_age', wide_to_long(survey, 'age'),
    'alter_gender', ...,
    'alter_class', ...,
    'alter_home', ...,])

alter_data

In [None]:
grader.check("q1")

<!-- BEGIN QUESTION -->

## Question 2: Ages of Berkeley Students' Confidants

OK, now that we have created a long-form dataset, let's make use of it to learn about the people Berkeley students discuss important matters with.

Start by making a histogram of the confidants' ages. Please use the following value for the parameter `bins`: `np.arange(15, 70, 5)`

*Hint: you can look up the documentation of the `Table.hist()` function*

<!-- END QUESTION -->

<!-- BEGIN QUESTION -->

## Question 3: Respondent's Ages
Now make a histogram of the survey respondents' ages.

Use the following value for the parameter `bins`: `np.arange(15, 35, 1)`. You can also try removing this and reusing the line of code from the alters' age histogram to see what happens. <BR>

*Hint: Make sure you use the `survey` table, not the `alter_data` table.*

<!-- END QUESTION -->

<!-- BEGIN QUESTION -->

## Question 4: Compare histograms
Compare the two histograms. What does this tell you about homophily among confidants?

_Type your answer here, replacing this text._

<!-- END QUESTION -->

Now you can make a scatter plot comparing the ages of survey respondents and the ages of the alters.

In [None]:
alter_data.scatter('respondent_age', 'alter_age')

<!-- BEGIN QUESTION -->

## Question 5
What does this scatter plot tell you about homophily among confidants?

_Type your answer here, replacing this text._

<!-- END QUESTION -->

We are able to get a lot of descriptive information from these two datasets. Here is a practice example.

**Practice** What proportion of all the alters are from the Bay Area?

In [None]:
# First, create a variable, alter_bay, which has the value True if the alter is from the Bay Area
# and False otherwise.
alter_bay = alter_data.column(...) == 'Bay Area' # two equals signs (==) make a comparison; the result is True (equal to) or False (not equal to)

In [None]:
# Second, calculate the proportion of rows for which alter_bay is True.
alter_bay_proportion = ...
alter_bay_proportion

### Class year of Berkeley students' confidants

In this section, we will start to explore the relationship between respondents' class years and their alters' class years. Our approach will be to walk through one example -- the alters reported by sophomores -- in detail. Then, we will write a function to easily allow us to repeat our analysis for sophomores, juniors, and seniors.

First, let us look at the distribution of class year among all of the confidants reported. First use `group` to make a simple table with the counts of alters by class year.

In [None]:
alter_data.group('alter_class')

Now you can make a bar plot that shows those counts graphically.

In [None]:
# First we sort this table by counts of each group
alter_data.group('alter_class').sort('count', descending=True)

In [None]:
# By adding the function of plotting: .barh(the variable you want to plot), we can create a bar plot.
alter_data.group('alter_class').sort('count', descending=True).barh('alter_class')

Make another bar plot that shows the class years of survey respondents.

In [None]:
survey.group('respondent_class').sort('count', descending=True).barh('respondent_class')

Now that we have a sense of what all respondents and all of the alters look like, we can dig into the alters of a particular class group.

## Question 6: 
Create a new table that only has alters reported by respondents who are sophomores using `where` and `are.equal_to`.

In [None]:
alters_of_sophomores = ...
q6 = alters_of_sophomores.num_rows
q6

In [None]:
grader.check("q6")

Make a plot that shows the class years of the alters reported by sophomores using `.barh()`.

In [None]:
alters_of_sophomores.group('alter_class').sort('count', descending=True).barh('alter_class')

Let's make a function called `plot_alter_class` that makes a plot of the class years of alters reported by respondents in a particular class. Your function should take as its arguments

* `data` - the alter dataset
* `class_year` - the class year of respondents to focus on
    
For example, running

    plot_alter_class(alter_data, 'Sophomore')

should produce the plot you just made above.

In [None]:
def plot_alter_class(data, class_year):
    """Bar plot of the class years of alters named by respondents in one class year.

    Parameters
    ----------
    data : Table
        The long alter dataset; it must have the columns 'respondent_class'
        and 'alter_class'.
    class_year : str
        The class year of the respondents to focus on, e.g. 'Sophomore'.

    Returns
    -------
    None
        The function only draws the plot.
    """
    # Keep the alters reported by respondents in the requested class year
    # (filter on the respondent's class, not the alter's).
    to_plot = data.where('respondent_class', class_year) #create the dataset for plotting
    # Count those alters by their own class year, largest group first, so that
    # plot_alter_class(alter_data, 'Sophomore') reproduces the sophomore plot made by hand.
    to_plot.group('alter_class').sort('count', descending=True).barh('alter_class')

Use `plot_alter_class` function to produce plots of the class years of the alters of freshmen, sophomores, juniors, and seniors.

In [None]:
# freshmen
plot_alter_class(alter_data, 'Freshman')

In [None]:
# sophomores
plot_alter_class(alter_data, 'Sophomore')

In [None]:
# juniors
plot_alter_class(alter_data, 'Junior')

In [None]:
# seniors
plot_alter_class(alter_data, 'Senior')

<!-- BEGIN QUESTION -->

## Question 7
Do you see evidence of homophily with respect to class year?

_Type your answer here, replacing this text._

<!-- END QUESTION -->



---

To double-check your work, the cell below will rerun all of the autograder tests.

In [None]:
grader.check_all()

## Submission

Make sure you have run all cells in your notebook in order before running the cell below, so that all images/graphs appear in the output. The cell below will generate a zip file for you to submit. **Please save before exporting!**

Please upload the .zip file to Gradescope.

In [None]:
# Save your notebook first, then run this cell to export your submission.
grader.export(run_tests=True)