In [1]:
import geopandas
from tqdm import tqdm
from datetime import datetime

In [2]:
import pandas as pd
import numpy as np
import os

%load_ext lab_black
%matplotlib inline

In [55]:
class MakeMap:
    """Build county-level stress maps from per-day tweet CSV files.

    The class bundles the project's file-system layout (input CSV folder,
    county shapefile, map output folder) with helpers that read those inputs,
    average ``stress_rate`` per ``CountyId`` and write results via geopandas.
    """

    def __init__(self, scratch_path="/scratch/user/diya.li/"):
        """Resolve all project paths and list the available input CSV files.

        Args:
            scratch_path: Root directory containing the ``twitter-action``
                tree. Defaults to the original cluster location, so
                ``MakeMap()`` behaves as before.

        Raises:
            OSError: Propagated from ``os.listdir`` when the input folder
                cannot be listed.
        """
        self.scratch_path = scratch_path
        self.input_folder = "twitter-action/depression/2D-windows-stress-topic"
        self.input_folder_path = os.path.join(self.scratch_path, self.input_folder)

        # zip file sample
        self.zip_county_shp = "twitter-action/depression/county-map/county.zip"
        self.zip_county_shp_path = (
            "zip://" + os.path.join(self.scratch_path, self.zip_county_shp) + "!data"
        )

        # shape file sample
        self.county_shp = "twitter-action/depression/county-map/county-map.shp"
        self.county_shp_path = os.path.join(self.scratch_path, self.county_shp)

        # One known input file, handy for quick experiments.
        self.sample_input_path = os.path.join(self.input_folder_path, "2020-02-22.csv")

        self.map_output_path = os.path.join(
            self.scratch_path, "twitter-action/depression/county-map"
        )

        self.csv_path_list = self._get_df_path_list()

        # Column layout of the frames built by read_csv_folder_with_stress:
        # the first six values are copied positionally from a shapefile row,
        # the last two are appended by that method. This assumes the shapefile
        # has exactly these six columns in this order -- TODO confirm.
        self.stress_df_columns = [
            "cnty_fips",
            "state_name",
            "state_fips",
            "cnty_name",
            "state_abbr",
            "geometry",
            "avg_stress",
            "date",
        ]

    def _get_df_path_list(self):
        """Return the sorted file names in the input folder that end in ``.csv``."""
        return sorted(
            name for name in os.listdir(self.input_folder_path) if name.endswith(".csv")
        )

    @staticmethod
    def _date_string_from_path(csv_path):
        """Return the file name of ``csv_path`` without directory and extension."""
        return os.path.splitext(os.path.basename(csv_path))[0]

    @staticmethod
    def _mean_stress_rate(county_df):
        """Return the arithmetic mean of ``county_df["stress_rate"]``.

        Uses the raw numpy values, so a NaN in the column yields NaN, the same
        as a plain sum divided by the row count.
        """
        return county_df["stress_rate"].to_numpy().mean()

    def preprocess_county_shp(self, shp_obj):
        """Deprecated: drop unused attribute columns and the Virgin Islands rows.

        Args:
            shp_obj: County frame that contains the listed columns and a
                ``state_name`` column.

        Returns:
            A new frame without the dropped columns and without rows whose
            ``state_name`` is "Virgin Islands of the U.S.".
        """
        unused_columns = [
            "OBJECTID",
            "Value",
            "Shape_Leng",
            "SmoValue",
            "DrValue",
            "InacValue",
            "MedCValue",
            "Income",
            "Shape_Le_1",
            "Shape_Area",
            "Deathrate",
            "Diabetes",
        ]
        shp_obj = shp_obj.drop(columns=unused_columns)
        return shp_obj[shp_obj["state_name"] != "Virgin Islands of the U.S."]

    def write_shp(self, shp_obj, output_path, driver="ESRI Shapefile"):
        """Write ``shp_obj`` to ``output_path`` through its ``to_file`` method.

        The parent directory of ``output_path`` is created when missing so the
        write does not fail on a fresh checkout.

        Args:
            shp_obj: Object exposing ``to_file(path, driver=...)``.
            output_path: Destination file path.
            driver: Driver name forwarded to ``to_file``.
        """
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        shp_obj.to_file(output_path, driver=driver)

    def test_plot(self, shp_obj):
        """Quick visual check: plot ``shp_obj`` with its own ``plot`` method.

        An earlier version also loaded the ``naturalearth_lowres`` sample
        dataset as a possible base map but never drew it; that unused load
        (which relies on the removed ``geopandas.datasets`` module) is gone.
        """
        shp_obj.plot()

    def read_county_shp(self, shp_path):
        """Read the county shapefile and convert ``cnty_fips`` to float64.

        Args:
            shp_path: Path passed to ``geopandas.read_file``.

        Returns:
            The frame returned by geopandas with a float64 ``cnty_fips`` column.
        """
        county_shp_obj = geopandas.read_file(shp_path)
        county_shp_obj["cnty_fips"] = county_shp_obj["cnty_fips"].astype(np.float64)
        return county_shp_obj

    def read_one_csv(self, df_path):
        """Read a single CSV file, treating ``\\n`` as the only line terminator."""
        return pd.read_csv(df_path, lineterminator="\n")

    @staticmethod
    def get_stress_item():
        """Placeholder, not implemented yet.

        Declared as a static method so that it can be called both on the class
        and on an instance.
        """
        pass

    def read_csv_folder_with_stress(self, shp_obj):
        """Average the stress rate per county for every input file (one per date).

        For each CSV in ``self.csv_path_list`` the rows are grouped by
        ``CountyId``; every county that also appears in ``shp_obj["cnty_fips"]``
        contributes one output row made of the first matching shapefile row
        plus the mean ``stress_rate`` and the date derived from the file name.
        Counties without a shapefile match are skipped.

        After each file the frame accumulated so far is written to
        ``./temp_map/<date>.gpkg`` with the GPKG driver.
        NOTE(review): each file therefore contains all dates up to and
        including its own -- confirm this cumulative output is intended.

        Args:
            shp_obj: County frame whose columns match the first six entries of
                ``self.stress_df_columns``.

        Returns:
            The concatenation of all per-date frames (an empty DataFrame when
            there are no input files).
        """
        frames = []
        concated_df = pd.DataFrame()
        for csv_name in tqdm(self.csv_path_list):
            csv_path = os.path.join(self.input_folder_path, csv_name)
            df = self.read_one_csv(csv_path)

            date_string = self._date_string_from_path(csv_path)
            date_obj = str(pd.to_datetime(date_string)) + "+00:00"

            row_list = []
            for county_id, county_df in df.groupby("CountyId"):
                matches = shp_obj[shp_obj["cnty_fips"] == county_id].values.tolist()
                if not matches:
                    continue
                # Only the first matching shapefile row is used.
                row = matches[0]
                row.append(self._mean_stress_rate(county_df))
                row.append(date_obj)
                row_list.append(row)

            frames.append(pd.DataFrame(row_list, columns=self.stress_df_columns))
            concated_df = pd.concat(frames)

            # Write the accumulated frame as a GeoPackage. Use the instance
            # itself rather than a notebook-level global.
            gdf = geopandas.GeoDataFrame(concated_df, geometry=concated_df.geometry)
            self.write_shp(gdf, "./temp_map/{}.gpkg".format(date_string), driver="GPKG")
        return concated_df

    def read_csv_folder(self):
        """Read every input CSV, tag it with its file date and concatenate.

        Returns:
            One DataFrame with all rows and an extra ``date`` column parsed from
            each file name; an empty DataFrame when there are no input files.
        """
        frames = []
        for csv_name in tqdm(self.csv_path_list):
            csv_path = os.path.join(self.input_folder_path, csv_name)
            df = self.read_one_csv(csv_path)
            df["date"] = pd.to_datetime(self._date_string_from_path(csv_path))
            frames.append(df)
        if not frames:
            return pd.DataFrame()
        # Concatenate once instead of growing a frame inside the loop.
        return pd.concat(frames)

    def add_geometory(self, df, shp_obj):
        """Unfinished (TODO): intended to attach geometry to ``df``.

        Currently only walks the rows and reads ``CountyId``; nothing is
        modified or returned, and the progress bar is never advanced.
        """
        nums = df.shape[0]
        with tqdm(total=nums) as pbar:
            for index in range(nums):
                countyId = str(df.iloc[index]["CountyId"])

    def all_county_stress_rate(self, df, shp_obj):
        """Add an ``avg_stress`` column to ``shp_obj`` from the rows of ``df``.

        Args:
            df: Frame with ``CountyId`` and ``stress_rate`` columns.
            shp_obj: County frame with a ``cnty_fips`` column; modified in place.

        Returns:
            ``shp_obj`` itself. Counties that do not occur in ``df`` get a
            missing value in ``avg_stress``.
        """
        avg_stress_by_county = {
            county_id: self._mean_stress_rate(county_df)
            for county_id, county_df in df.groupby("CountyId")
        }
        shp_obj["avg_stress"] = shp_obj["cnty_fips"].apply(avg_stress_by_county.get)
        return shp_obj

    def write_topic(self, concated_df):
        """Unfinished: select, per topic column, the rows with a non-zero value.

        Iterates over ``topic_0`` .. ``topic_8``; the CSV export of each
        selection is still commented out, so nothing is written or returned.
        """
        topic_names = ["topic_{}".format(x) for x in range(9)]
        for topic in topic_names:
            temp_df = concated_df[concated_df[topic] != 0.0]
            #     temp_df.to_csv("topic-map/{}.csv".format(topic), index=False)

In [56]:
# Create the helper instance used by the cells below. The constructor lists
# the input CSV folder, so the scratch directory configured in MakeMap must
# be reachable when this cell runs.
MM = MakeMap()
# tmp_df = MM.read_one_csv(MM.sample_input_path)
# MM.write_shp(tmp_shp)
# MM.test_plot(tmp_shp)

In [57]:
# tmp_df = MM.read_one_csv(MM.sample_input_path)

In [58]:
# tmp_df["created_at"].iloc[0]

In [59]:
# concated_df = MM.read_csv_folder()
# concated_df_unique = concated_df.drop_duplicates(subset="cleaned_text")

In [60]:
# Load the county polygons, then build one row per (county, input file) with
# the county's mean stress rate. The second call also writes a GeoPackage per
# input file under ./temp_map/; the recorded run took about 15 minutes for
# 69 files (see the progress bar output).
shp_obj = MM.read_county_shp(MM.county_shp_path)
# tmp_shp = MM.all_county_stress_rate(concated_df_unique, tmp_shp)
concated_df = MM.read_csv_folder_with_stress(shp_obj)

100%|██████████| 69/69 [15:13<00:00, 13.24s/it]


In [36]:
# Spot-check the first date value: it is a plain string ending in "+00:00".
concated_df["date"].iloc[0]

'2020-01-22 00:00:00+00:00'

In [37]:
# Preview the combined per-county, per-date table.
concated_df.head()

Unnamed: 0,cnty_fips,state_name,state_fips,cnty_name,state_abbr,geometry,avg_stress,date
0,1121.0,Alabama,1,Talladega,AL,"POLYGON ((-9627906.4789 3910078.557800002, -96...",0.69862,2020-01-22 00:00:00+00:00
1,4007.0,Arizona,4,Gila,AZ,"POLYGON ((-12411049.494 4029747.775899999, -12...",0.510247,2020-01-22 00:00:00+00:00
2,4013.0,Arizona,4,Maricopa,AZ,"POLYGON ((-12361191.7715 3958648.675099999, -1...",0.669872,2020-01-22 00:00:00+00:00
3,4023.0,Arizona,4,Santa Cruz,AZ,"POLYGON ((-12295524.7839 3676678.829099998, -1...",0.941334,2020-01-22 00:00:00+00:00
4,5043.0,Arkansas,5,Drew,AR,"POLYGON ((-10239094.5889 3948497.221799999, -1...",0.751438,2020-01-22 00:00:00+00:00


In [38]:
# MM.write_shp(tmp_shp, "./temp_map/sum.shp")

In [39]:
# Wrap the combined table as a GeoDataFrame, using its existing "geometry"
# column as the active geometry.
gdf = geopandas.GeoDataFrame(concated_df, geometry=concated_df.geometry)

In [40]:
# Confirm the date string survived the GeoDataFrame conversion unchanged.
gdf["date"].iloc[0]

'2020-01-22 00:00:00+00:00'

In [41]:
# gdf["date"] = gdf["date"].apply(lambda x: str(x))
# gdf["date"].iloc[0]

In [42]:
# Write all dates together into a single ESRI Shapefile.
MM.write_shp(gdf, "./temp_map/sum-withdate.shp", driver="ESRI Shapefile")

In [276]:
# Collect the mean stress rate of every county group into a flat list.
# NOTE(review): `concated_df_unique` is only assigned in a commented-out cell
# above, so this cell fails on a fresh kernel unless that cell is restored.
# Its execution count (276) is also out of sequence with the other cells.
stress_rate_list = []
for county_df in concated_df_unique.groupby("CountyId"):

    # county_df is a (CountyId, sub-frame) tuple; index 1 is the sub-frame.
    avg_stress = sum(county_df[1]["stress_rate"].values) / county_df[1].shape[0]
    # print(county_df[0], county_df[1].shape, avg_stress)
    stress_rate_list.append(avg_stress)
    # stress_rate/num tweets

In [33]:
# NOTE(review): `Date` is not defined in any visible cell, so this call raises
# NameError (see the recorded output). Presumably a leftover experiment --
# confirm, then delete the cell or replace it with the intended call.
Date()

NameError: name 'Date' is not defined