In [1]:
import pandas as pd
import numpy as np
import os
import ujson
import string
import re
import geopandas
import nltk

%load_ext lab_black
%matplotlib inline

In [3]:
class StressTopicMap:
    """Aggregate per-row stress/topic CSV files by county and export GeoJSON maps.

    Each input CSV is expected to provide a ``CountyId`` column, a
    ``stress_rate`` column and the nine ``topic_0`` .. ``topic_8`` columns.
    Rows are grouped by county, joined onto the basemap geometry through
    ``cnty_fips`` and written out as one GeoJSON file per input CSV.

    Attributes set by ``__init__``:
        input_folder_path_list: sorted list of CSV paths found in the input folder.
        basemap_path: path of the basemap GeoJSON file.
        basemap_obj: basemap loaded by ``read_basemap``.
        topic_columns: names of the topic columns that get summed per county.
        output_folder_path: folder that receives the generated GeoJSON files.
        output_columns: column order of the per-county result frame.
    """

    def __init__(self):
        """Resolve all paths below ``$SCRATCH`` and load the basemap.

        Raises:
            KeyError: if the ``SCRATCH`` environment variable is not set.
        """
        scratch_path = os.environ["SCRATCH"]

        # input csv folder
        input_folder = "covid-map/twitter-dataset-processed-topic-2W"
        input_folder_path = os.path.join(scratch_path, input_folder)
        # os.listdir returns entries in arbitrary order; sort so that runs are
        # reproducible, and match on the real ".csv" extension.
        self.input_folder_path_list = [
            os.path.join(input_folder_path, x)
            for x in sorted(os.listdir(input_folder_path))
            if x.endswith(".csv")
        ]

        # basemap path
        basemap = "twitter-action/depression/basemap/basemap.geojson"
        self.basemap_path = os.path.join(scratch_path, basemap)

        # basemap object
        self.basemap_obj = self.read_basemap(self.basemap_path)

        # topic columns name
        self.topic_columns = ["topic_{}".format(x) for x in range(9)]

        # result path
        output_folder = "covid-map/map/county-stress-phq-9-2W"
        self.output_folder_path = os.path.join(scratch_path, output_folder)

        # output columns
        self.output_columns = ["cnty_fips", "stress_rate"] + self.topic_columns

    def read_basemap(self, basemap_path):
        """Load the basemap and cast its ``cnty_fips`` column to int64.

        The cast makes the join key's dtype match the one produced in
        ``process_one``.
        """
        bs = geopandas.read_file(basemap_path)
        bs["cnty_fips"] = bs["cnty_fips"].astype(np.int64)
        return bs

    def write_to_geojson(self, map_obj, output_path):
        """Write ``map_obj`` to ``output_path`` using the GeoJSON driver."""
        print("writing to", output_path)
        map_obj.to_file(output_path, driver="GeoJSON")

    def merge_basemap(self, df):
        """Inner-join ``df`` onto the basemap by ``cnty_fips``.

        The basemap is the left operand so the geometry-carrying frame drives
        the merge. Counties missing from either side are dropped (inner join).
        """
        return self.basemap_obj.merge(df, on="cnty_fips")

    def groupby_county(self, df):
        """Collapse ``df`` to one row per county.

        For every ``cnty_fips`` group the result holds:
          * ``stress_rate``: ``np.mean`` over the group's values (a NaN in the
            group therefore yields NaN for that county);
          * one column per topic: the sum over the rows of the group that have
            no missing value in any topic column.

        Args:
            df: frame with ``cnty_fips``, ``stress_rate`` and the topic columns.

        Returns:
            DataFrame with columns ``self.output_columns``; empty (but with
            those columns) when ``df`` has no rows.
        """
        records = []
        for county_id, county_df in df.groupby("cnty_fips"):
            avg_stress = np.mean(county_df["stress_rate"].to_numpy())

            # only rows with a complete set of topic values contribute to sums
            complete_rows = county_df.dropna(subset=self.topic_columns)
            topic_totals = [complete_rows[col].sum() for col in self.topic_columns]

            records.append([county_id, avg_stress] + topic_totals)

        return pd.DataFrame(records, columns=self.output_columns)

    def process_one(self, df_path):
        """Read one CSV, aggregate it per county and attach basemap geometry.

        Rows without a ``CountyId`` are discarded; the remaining ids are cast
        to int64 and exposed as ``cnty_fips`` (``CountyId`` itself is dropped).
        """
        raw_df = pd.read_csv(df_path, lineterminator="\n")

        # Build a new frame instead of mutating a filtered slice in place,
        # which is what triggers pandas' SettingWithCopyWarning.
        df = (
            raw_df[raw_df["CountyId"].notnull()]
            .assign(cnty_fips=lambda d: d["CountyId"].astype(np.int64))
            .drop(columns="CountyId")
        )

        # get county_df
        county_df = self.groupby_county(df)

        # merge to basemap
        return self.merge_basemap(county_df)

    def _output_path_for(self, df_path):
        """Return the GeoJSON path in the output folder matching ``df_path``.

        Only the file extension is swapped; a ``csv`` substring elsewhere in
        the file name is left untouched.
        """
        stem, _ext = os.path.splitext(os.path.basename(df_path))
        return os.path.join(self.output_folder_path, stem + ".geojson")

    def process_all(self, df_path_list):
        """Convert every CSV in ``df_path_list`` to a GeoJSON file.

        Inputs whose output file already exists are skipped, so the method can
        be re-run to resume an interrupted batch.
        """
        # make sure the destination exists before the first write
        os.makedirs(self.output_folder_path, exist_ok=True)

        for df_path in df_path_list:
            output_path = self._output_path_for(df_path)
            if os.path.isfile(output_path):
                print("skipping", output_path)
                continue

            county_map = self.process_one(df_path)
            self.write_to_geojson(county_map, output_path)
        print("all done")

# Build the pipeline object once. The constructor reads $SCRATCH, lists the
# input CSV folder and loads the basemap, so this line touches the filesystem.
STM = StressTopicMap()

In [4]:
# Run the full export: one GeoJSON per input CSV discovered by the constructor.
# Inputs whose output file already exists are skipped by process_all.
input_folder_path_list = STM.input_folder_path_list
# To debug a single file instead of the whole batch, uncomment:
# df_path = input_folder_path_list[0]
# df = STM.process_one(df_path)
STM.process_all(input_folder_path_list)

writing to /scratch/user/diya.li/covid-map/map/county-stress-phq-9-2W/2020-02-09.geojson
writing to /scratch/user/diya.li/covid-map/map/county-stress-phq-9-2W/2020-03-22.geojson
writing to /scratch/user/diya.li/covid-map/map/county-stress-phq-9-2W/2020-03-08.geojson
writing to /scratch/user/diya.li/covid-map/map/county-stress-phq-9-2W/2020-04-05.geojson
writing to /scratch/user/diya.li/covid-map/map/county-stress-phq-9-2W/2020-01-26.geojson
writing to /scratch/user/diya.li/covid-map/map/county-stress-phq-9-2W/2020-02-23.geojson
writing to /scratch/user/diya.li/covid-map/map/county-stress-phq-9-2W/2020-04-19.geojson
all done
