In [1]:
import geopandas
from tqdm import tqdm
from datetime import datetime
import zipfile

In [2]:
import pandas as pd
import numpy as np
import os
from shapely.geometry import shape

%load_ext lab_black
%matplotlib inline

In [3]:
class MakeMap:
    def __init__(self):

        self.base_path = os.environ["SCRATCH"]
        input_folder = "covid-map/twitter-dataset-processed-stress"
        self.input_folder_path = os.path.join(self.base_path, input_folder)

        # zip file smaple
        zip_county_shp = "twitter-action/depression/county-map/county.zip"
        self.zip_county_shp_path = (
            "zip://" + os.path.join(self.base_path, zip_county_shp) + "!data"
        )

        # shape file sample
        county_shp = "twitter-action/depression/basemap/data/data_2.shp"
        self.county_shp_path = os.path.join(self.base_path, county_shp)

        basemap = "twitter-action/depression/basemap/basemap.geojson"
        self.basemap_path = os.path.join(self.base_path, basemap)

        self.sample_input_path = "/scratch/user/diya.li/twitter-action/depression/2D-windows-stress-topic/2020-02-22.csv"

        self.map_output_path = os.path.join(
            self.base_path, "twitter-action/depression/county-map"
        )

        # get all csv path from input folder
        self.csv_path_list = self._get_df_path_list()

        self.stress_df_columns = [
            "FID",
            "cnty_fips",
            "state_name",
            "state_fips",
            "cnty_name",
            "state_abbr",
            "geometry",
            "avg_stress",
            "date",
        ]

        self.basemap_shp = geopandas.read_file(self.basemap_path)

    def test_plot(self, shp_obj):
        """deprecated function"""
        world = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
        world = world[world["continent"] == "North America"]
        # base = world.plot(color="white", edgecolor="black")
        shp_obj.plot()

    def preprocess_county_shp(self, shp_obj):
        """deprecated function"""
        shp_obj["cnty_fips"] = shp_obj["cnty_fips"].apply(lambda x: np.float64(x))

        shp_obj = shp_obj.drop(
            [
                "OBJECTID",
                "Value",
                "Shape_Leng",
                "SmoValue",
                "DrValue",
                "InacValue",
                "MedCValue",
                "Income",
                "Shape_Le_1",
                "Shape_Area",
                "Deathrate",
                "Diabetes",
            ],
            axis=1,
        )
        # shp_obj = shp_obj[shp_obj["state_name"] != "Virgin Islands of the U.S."]
        # shp_obj.to_crs("EPSG:3395")
        return shp_obj

    def all_county_stress_rate(self, df, shp_obj):
        """
        deprecated function
        """
        # shp_obj["cnty_fips"] = shp_obj["cnty_fips"].apply(lambda x: str(np.float64(x)))

        avg_stress_set = {}

        for group in df.groupby("CountyId"):
            avg_stress = sum(group[1]["stress_rate"].values) / group[1].shape[0]
            county_df_num = group[0]
            # print(county_df[0], county_df_num, avg_stress)
            # shp_obj.loc[
            #    shp_obj["cnty_fips"] == county_df_num, ["avg_stress"]
            # ] = avg_stress
            avg_stress_set[county_df_num] = avg_stress
        # print(avg_stress_set)
        shp_obj["avg_stress"] = shp_obj["cnty_fips"].apply(
            lambda x: avg_stress_set.get(x)
        )

        return shp_obj

    def write_shp(self, shp_obj, output_path, driver="ESRI Shapefile"):
        shp_obj.to_file(output_path, driver=driver)

    def _get_df_path_list(self):
        l = []
        for df_path in os.listdir(self.input_folder_path):
            if df_path.endswith("csv"):
                l.append(df_path)
        l.sort()
        return l

    def read_one_csv(self, df_path):
        df = pd.read_csv(df_path, lineterminator="\n")
        return df

    def zipdir(self, path, ziph):
        # ziph is zipfile handle
        for root, dirs, files in os.walk(path):
            for file in files:
                ziph.write(os.path.join(root, file))

    def zip_list_file(self, input_file_name):
        # input_file_path e.g. 2020-01-28
        zipf = zipfile.ZipFile(
            self.map_output_path + "/{}.zip".format(input_file_name.replace("-", "")),
            "w",
            zipfile.ZIP_DEFLATED,
        )

        input_file_path = os.path.join(self.map_output_path, input_file_name)

        self.zipdir(input_file_path, zipf)
        zipf.close()

    def read_csv_folder_with_stress(self, input_folder_path, concated=False):
        """is function is used to group the stress string by date"""
        nums = len(self.csv_path_list)
        concated_df = pd.DataFrame()
        with tqdm(total=nums) as pbar:
            for df_path in self.csv_path_list:

                df_path = os.path.join(input_folder_path, df_path)
                df = self.read_one_csv(df_path)

                # handle data
                date_string = df_path.split("/")[-1].replace(".csv", "")
                date_obj = str(pd.to_datetime(date_string)) + "+00:00"
                row_list = []

                # check path
                output_path = os.path.join(
                    os.path.abspath("./county-map"), date_string.replace("-", "")
                )

                if os.path.isdir(output_path):
                    pbar.update(1)
                    continue
                if os.path.isfile(os.path.join(output_path + ".geojson")):
                    pbar.update(1)
                    print("skipping")
                    continue

                for group in df.groupby("CountyId"):
                    countyid = group[0]
                    row = self.basemap_shp[
                        self.basemap_shp["cnty_fips"] == countyid
                    ].values.tolist()
                    if len(row) == 0:
                        continue
                    row = row[0]

                    avg_stress = sum(group[1]["stress_rate"].values) / group[1].shape[0]

                    row.append(avg_stress)

                    if concated:
                        ## MARKER concated
                        date_obj = pd.to_datetime(date_obj)
                        row.append(date_obj)
                    else:
                        row.append(date_obj)

                    row_list.append(row)

                # print(row_list[0])
                new_df = pd.DataFrame(row_list, columns=self.stress_df_columns)
                if concated:
                    ## MARKER concated
                    # combine all the csv to one file
                    concated_df = pd.concat([concated_df, new_df])
                else:
                    # write to geojson
                    gdf = geopandas.GeoDataFrame(new_df, geometry=new_df.geometry)
                    gdf_path = os.path.join(output_path + ".geojson")
                    print(gdf_path)
                    # os.mkdir(output_path)
                    MM.write_shp(gdf, gdf_path, driver="GeoJSON")
                    # self.zip_list_file(date_string)
                pbar.update(1)

            if concated:
                ## MARKER concated
                # write concated df to geojson
                concated_gdf = geopandas.GeoDataFrame(
                    concated_df, geometry=concated_df.geometry
                )
                concated_gdf_path = os.path.join(output_path + ".geojson")
                print(concated_gdf_path)
                # os.mkdir(output_path)
                MM.write_shp(concated_gdf, concated_gdf_path, driver="GeoJSON")
        # return concated_df

    def read_csv_folder(self, csv_path_list):
        """
        read all csv from a folder 
        """
        nums = len(csv_path_list)
        concated_df = pd.DataFrame()
        with tqdm(total=nums) as pbar:
            for df_path in csv_path_list:
                df_path = os.path.join(self.input_folder_path, df_path)
                date_string = df_path.split("/")[-1].replace(".csv", "")
                df = self.read_one_csv(df_path)
                df["date"] = pd.to_datetime(date_string)
                concated_df = pd.concat([concated_df, df])
                pbar.update(1)
        return concated_df


class MakeTopicMap(MakeMap):
    def __init__(self):
        MakeMap.__init__(self)

    def add_geometory_to_topic(self, df, shp_obj):
        """todo"""
        nums = df.shape[0]
        with tqdm(total=nums) as pbar:
            for index in range(nums):
                countyId = str(df.iloc[index]["CountyId"])

    def write_topic(self, concated_df):
        topic_names = ["topic_{}".format(x) for x in range(9)]
        for topic in topic_names:
            temp_df = concated_df[concated_df[topic] != 0.0]
            #     temp_df.to_csv("topic-map/{}.csv".format(topic), index=False)

In [4]:
MM = MakeMap()
MTM = MakeTopicMap()
# preprocess base map
# tmp_shp = MM.basemap_shp
# tmp_shp = MM.preprocess_county_shp(tmp_shp)
# MM.write_shp(tmp_shp, output_path="./basemap/basemap.geojson", driver="GeoJSON")
##### done


# tmp_df = MM.read_one_csv(MM.sample_input_path)
# tmp_shp = MM.basemap_shp
MM.read_csv_folder_with_stress(MM.input_folder_path)

  6%|▋         | 1/16 [00:01<00:19,  1.29s/it]

skipping


 12%|█▎        | 2/16 [00:05<00:31,  2.26s/it]

skipping


 19%|█▉        | 3/16 [00:08<00:29,  2.24s/it]

skipping


 25%|██▌       | 4/16 [00:09<00:23,  1.93s/it]

skipping


 31%|███▏      | 5/16 [00:10<00:19,  1.74s/it]

skipping


 38%|███▊      | 6/16 [00:11<00:16,  1.61s/it]

skipping


 44%|████▍     | 7/16 [00:12<00:12,  1.40s/it]

skipping


 50%|█████     | 8/16 [00:15<00:14,  1.80s/it]

skipping
/scratch/user/diya.li/twitter-action/depression/county-map/20200301.geojson


 56%|█████▋    | 9/16 [00:29<00:38,  5.43s/it]

/scratch/user/diya.li/twitter-action/depression/county-map/20200306.geojson


 62%|██████▎   | 10/16 [00:38<00:40,  6.67s/it]

/scratch/user/diya.li/twitter-action/depression/county-map/20200311.geojson


 69%|██████▉   | 11/16 [00:48<00:37,  7.43s/it]

/scratch/user/diya.li/twitter-action/depression/county-map/20200316.geojson


 75%|███████▌  | 12/16 [00:57<00:32,  8.01s/it]

/scratch/user/diya.li/twitter-action/depression/county-map/20200321.geojson


 81%|████████▏ | 13/16 [01:00<00:19,  6.64s/it]

/scratch/user/diya.li/twitter-action/depression/county-map/20200331.geojson


 88%|████████▊ | 14/16 [01:07<00:13,  6.75s/it]

/scratch/user/diya.li/twitter-action/depression/county-map/20200405.geojson


 94%|█████████▍| 15/16 [01:15<00:07,  7.08s/it]

/scratch/user/diya.li/twitter-action/depression/county-map/20200410.geojson


100%|██████████| 16/16 [01:20<00:00,  5.03s/it]


In [59]:
# concated_df = MM.read_csv_folder()

In [59]:

# tmp_shp = MM.all_county_stress_rate(concated_df_unique, tmp_shp)
# concated_df = MM.read_csv_folder_with_stress(shp_obj)

In [38]:
# MM.write_shp(tmp_shp, "./temp_map/sum.shp")

In [39]:
gdf = geopandas.GeoDataFrame(concated_df, geometry=concated_df.geometry)

In [40]:
gdf["date"].iloc[0]

'2020-01-22 00:00:00+00:00'

In [41]:
# gdf["date"] = gdf["date"].apply(lambda x: str(x))
# gdf["date"].iloc[0]

In [15]:
tmp_shp = geopandas.read_file("./basemap/basemap.geojson")
tmp_shp.head()

Unnamed: 0,FID,OBJECTID,cnty_fips,state_name,state_fips,cnty_name,state_abbr,Value,Shape_Leng,SmoValue,DrValue,InacValue,MedCValue,Income,Shape_Le_1,Shape_Area,Deathrate,Diabetes,geometry
0,1,1,1001,Alabama,1,Autauga,AL,37.6,0,28.88,6.3,30.3,17780,54.5,229109.169793,2188822000.0,396.1,13.0,"POLYGON ((-86.82067 32.34731, -86.81446 32.370..."
1,2,2,1003,Alabama,1,Baldwin,AL,31.3,0,28.96,9.3,22.6,16243,56.5,442509.254514,5829047000.0,352.4,9.3,"POLYGON ((-87.97309 31.16482, -87.93710 31.173..."
2,3,3,1005,Alabama,1,Barbour,AL,44.7,0,34.27,6.4,27.4,19547,32.9,266874.292769,3245946000.0,422.7,16.2,"POLYGON ((-85.74337 31.62624, -85.71720 31.679..."
3,4,4,1007,Alabama,1,Bibb,AL,37.9,0,34.74,5.5,34.1,19462,43.1,223820.703443,2322895000.0,437.3,13.7,"POLYGON ((-87.41986 33.01177, -87.31532 33.012..."
4,5,5,1009,Alabama,1,Blount,AL,34.6,0,31.46,5.0,27.3,18771,47.2,249967.888132,2503708000.0,434.7,12.4,"POLYGON ((-86.96799 33.86045, -86.92667 33.872..."


In [18]:
# tmp_shp = MM.preprocess_county_shp(tmp_shp)
tmp_shp.head()

Unnamed: 0,FID,cnty_fips,state_name,state_fips,cnty_name,state_abbr,geometry
0,1,1001,Alabama,1,Autauga,AL,"POLYGON ((-86.82067 32.34731, -86.81446 32.370..."
1,2,1003,Alabama,1,Baldwin,AL,"POLYGON ((-87.97309 31.16482, -87.93710 31.173..."
2,3,1005,Alabama,1,Barbour,AL,"POLYGON ((-85.74337 31.62624, -85.71720 31.679..."
3,4,1007,Alabama,1,Bibb,AL,"POLYGON ((-87.41986 33.01177, -87.31532 33.012..."
4,5,1009,Alabama,1,Blount,AL,"POLYGON ((-86.96799 33.86045, -86.92667 33.872..."


In [276]:
stress_rate_list = []
for county_df in concated_df_unique.groupby("CountyId"):

    avg_stress = sum(county_df[1]["stress_rate"].values) / county_df[1].shape[0]
    # print(county_df[0], county_df[1].shape, avg_stress)
    stress_rate_list.append(avg_stress)
    # stress_rate/num tweets