# Example: matching control users to users with SRDD

- Before this step, you need to build the control user pool as described in the paper; it is drawn from 10 selected non-depression subreddits.
- To make matching more efficient, this study sampled a subgroup of control candidates for each matching run.

In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive so the CSV inputs under MyDrive can be read from Colab.
drive.mount('/content/drive')

# Users with SRDD that need a matched control user.
file_path = '/content/drive/MyDrive/Reddit/rematch_list_depressed_windowed.csv'
# Pool of candidate control users.
file_path2 = '/content/drive/MyDrive/Reddit/candidate_control_authors_info.csv'

# Both files are read with the same latin1 encoding.
rematch_list, control_condidates = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in (file_path, file_path2)
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Display the table of users with SRDD as the cell output.
rematch_list

In [None]:
# Keep only candidates whose post count is known. 'total_post' is one of the
# matching dimensions, so a candidate without it would produce NaN covariances
# and NaN distances and could never be selected as a match.
# .copy() makes the filtered frame independent, so the columns added to it in
# later cells are not written into a slice of the unfiltered frame.
control_condidates = control_condidates[control_condidates['total_post'].notna()].copy()

In [None]:
# Display the filtered candidate control table as the cell output.
control_condidates

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis

# Nearest-neighbour matching of each SRDD user to one control user.
#
# Users are first grouped by the year and month in which the account was
# created; within a group, the control user with the smallest Mahalanobis
# distance over `dimensions` is selected. The same control user may be
# selected for several SRDD users (matching is with replacement).

# The four user attributes used as matching dimensions.
dimensions = ['total_post', 'Is_Mod', 'Comment_Karma', 'Link_Karma']

# Cast Is_Mod to integer so it can be used as a numeric dimension.
rematch_list['Is_Mod'] = rematch_list['Is_Mod'].astype(int)
control_condidates['Is_Mod'] = control_condidates['Is_Mod'].astype(int)

# Parse each account-creation timestamp once and derive year/month from it.
# NOTE(review): the SRDD timestamps are parsed without an explicit format,
# while the control timestamps use day-first '%d/%m/%Y %H:%M:%S'. If the SRDD
# file uses the same day-first layout, ambiguous dates could be read
# month-first here - confirm against the CSV and pass `format=` if so.
srdd_created = pd.to_datetime(rematch_list['author_created_utc'])
rematch_list['year'] = srdd_created.dt.year
rematch_list['month'] = srdd_created.dt.month

control_created = pd.to_datetime(
    control_condidates['author_created_utc'], format='%d/%m/%Y %H:%M:%S'
)
control_condidates['year'] = control_created.dt.year
control_condidates['month'] = control_created.dt.month

# One dict per matched SRDD user.
matched_results = []

# Restrict candidates to accounts created in the same year and month.
for year, month in rematch_list[['year', 'month']].drop_duplicates().values:

    srdd_subset = rematch_list[
        (rematch_list['year'] == year) & (rematch_list['month'] == month)
    ]
    control_subset = control_condidates[
        (control_condidates['year'] == year) & (control_condidates['month'] == month)
    ]
    # There is no fallback to same-year or to all candidates: a group with no
    # (or too few) controls is skipped by the sample-size check below.

    srdd_values = srdd_subset[dimensions].to_numpy(dtype=float)
    control_values = control_subset[dimensions].to_numpy(dtype=float)

    # The covariance estimate needs more control rows than dimensions.
    if control_values.shape[0] <= control_values.shape[1]:
        print(f"Not enough samples for year {year}, month {month}. Skipping...")
        continue

    # Covariance of the control group; fall back to the pseudo-inverse when
    # the matrix is singular.
    cov_matrix = np.cov(control_values, rowvar=False)
    try:
        inv_cov_matrix = np.linalg.inv(cov_matrix)
    except np.linalg.LinAlgError:
        print(f"Singular matrix encountered for year {year}, month {month}. Using pseudo-inverse...")
        inv_cov_matrix = np.linalg.pinv(cov_matrix)

    control_names = control_subset['Username'].to_numpy()

    # Find the nearest control for each SRDD user. The distance to every
    # control in the group is computed in one vectorized step,
    # sqrt(delta @ VI @ delta) per row, instead of one scipy call per pair.
    for srdd_author, srdd_vector in zip(srdd_subset['author'], srdd_values):
        delta = control_values - srdd_vector
        distances = np.sqrt(((delta @ inv_cov_matrix) * delta).sum(axis=1))

        # NaN distances must never win; argmin returns the first minimum, so
        # ties resolve to the earliest control row.
        comparable = np.where(np.isnan(distances), np.inf, distances)
        nearest = int(np.argmin(comparable))
        if comparable[nearest] < np.inf:
            best_match = control_names[nearest]
            min_distance = float(comparable[nearest])
        else:
            best_match = None
            min_distance = float('inf')

        # Save the match for this SRDD user.
        matched_results.append({
            'srdd_author': srdd_author,
            'matched_control_author': best_match,
            'year': year,
            'month': month,
            'mahalanobis_distance': min_distance
        })

# Collect the matches in a data frame; explicit columns keep the schema stable
# even when no group could be matched.
matched_results_df = pd.DataFrame(
    matched_results,
    columns=['srdd_author', 'matched_control_author', 'year', 'month', 'mahalanobis_distance'],
)

  rematch_list['year'] = pd.to_datetime(rematch_list['author_created_utc']).dt.year
  rematch_list['month'] = pd.to_datetime(rematch_list['author_created_utc']).dt.month


Not enough samples for year 2017, month 3. Skipping...
Not enough samples for year 2018, month 5. Skipping...
Not enough samples for year 2016, month 1. Skipping...
Not enough samples for year 2014, month 8. Skipping...
Not enough samples for year 2010, month 2. Skipping...
Not enough samples for year 2015, month 11. Skipping...
No control group for year 2023, month 9. Using all control authors...
No control group for year 2023, month 2. Using all control authors...
No control group for year 2008, month 4. Using all control authors...


In [None]:
# Display the matching results as the cell output.
matched_results_df

In [None]:
# Write the matches to Drive as CSV; the DataFrame index is not written.
result_path = '/content/drive/MyDrive/Reddit/rematch_result_windowed.csv'
matched_results_df.to_csv(result_path, index=False)