# In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import datetime 
import re, os
from pathlib import Path 
import glob
import wordcloud
from wordcloud import WordCloud 
from wordcloud import STOPWORDS
from collections import Counter
import seaborn as sns
import string
from nltk import word_tokenize
import stopwords
import nltk
from nltk.corpus import stopwords

# Find every CSV matching the export pattern and stack them into one frame.
files = glob.glob(
    os.path.join(
        "/Users/diegoschummer/Desktop/STAT303-1/Project/data_new",
        "new_nuallcomments*.csv",
    )
)
merged = pd.concat([pd.read_csv(csv_path) for csv_path in files], ignore_index=True)

# Drop the listed columns before any further processing.
merged = merged.drop(
    columns=['id', 'permalink', 'subreddit', 'created', 'd_', 'Unnamed: 0']
)

## Declare Season Distribution
# Each key is a tuple of month numbers; each value is the season label given
# to those months.

# Academic-calendar grouping: December counts as Fall and June as Spring.
academic_seasons = {
    (9, 10, 11, 12): 'Fall',
    (1, 2, 3): 'Winter',
    (4, 5, 6): 'Spring',
    (7, 8): 'Summer',
}

# Conventional grouping: December-February is Winter, and so on.
normal_seasons = {
    (9, 10, 11): 'Fall',
    (1, 12, 2): 'Winter',
    (3, 4, 5): 'Spring',
    (6, 7, 8): 'Summer',
}

## Get Month numeric

# Parse the timestamps once; values that cannot be parsed become NaT because
# of errors='coerce'.
# NOTE(review): if 'created_utc' holds epoch seconds rather than date strings,
# to_datetime needs unit='s' — confirm against the CSV contents.
_created_ts = pd.to_datetime(merged['created_utc'], errors='coerce')

# Calendar date (no time of day) and month number for every row. Both come
# from the vectorised .dt accessor instead of a Python-level call per row.
merged['date'] = _created_ts.dt.date
merged['month'] = _created_ts.dt.month
merged = merged.drop(columns=['created_utc'])

def calendar_season(data, season_list):
    """Return a copy of *data* with a 'season' column derived from 'month'.

    Args:
        data: DataFrame with a 'month' column of month numbers (1, 2, 3, ...).
        season_list: dict whose keys are tuples of month numbers and whose
            values are the season label for those months.

    Returns:
        A new DataFrame equal to *data* plus a 'season' column. *data* itself
        is not modified, so one frame can be labelled with several different
        mappings without the results overwriting each other. Months that
        appear in no key are left missing.
    """
    # Flatten {(months...): label} into {month: label}. setdefault keeps the
    # first label when a month is listed under more than one key, which is
    # what a first-match scan over the keys would return.
    month_to_season = {}
    for months, label in season_list.items():
        for month in months:
            month_to_season.setdefault(month, label)

    labelled = data.copy()
    labelled['season'] = labelled['month'].map(lambda m: month_to_season.get(m))
    return labelled

# Give each labelling its own frame so the two 'season' columns can never
# overwrite each other: the "normal" subsets must keep meteorological labels
# even after the academic labelling has run.
merged_normal = calendar_season(merged.copy(), normal_seasons)
merged_academic = calendar_season(merged.copy(), academic_seasons)

# Per-season subsets under the conventional (meteorological) grouping.
Fall_normal = merged_normal[merged_normal['season'] == 'Fall']
Winter_normal = merged_normal[merged_normal['season'] == 'Winter']
Summer_normal = merged_normal[merged_normal['season'] == 'Summer']
Spring_normal = merged_normal[merged_normal['season'] == 'Spring']

# Per-season subsets under the academic-calendar grouping.
Fall_academic = merged_academic[merged_academic['season'] == 'Fall']
Winter_academic = merged_academic[merged_academic['season'] == 'Winter']
Summer_academic = merged_academic[merged_academic['season'] == 'Summer']
Spring_academic = merged_academic[merged_academic['season'] == 'Spring']
# Start from wordcloud's built-in stop list and add corpus-specific filler
# words.
# NOTE(review): this rebinds the name `stopwords`, hiding the
# `nltk.corpus.stopwords` import at the top of the file; clean() relies on the
# set defined here.
stopwords = set(STOPWORDS)
stopwords.update([
    'every', 'live', 'getting', 'everyone', 'program', 'day', 'nice', 'lol',
    'better', 'best', 'experience', 'Äô', 'Äôt', 'Äôm', 'Äôs', 'campus',
    'yes', 'anything', 'maybe', 'd', 'might', 'covid', 'thank', 'll', 'look',
    'back', 'course', 'college', 've', 'students', 'pretty', 'feel', 'right',
    'cs', 'probably', 'something', 'https', 'yeah', 'actually', 'many',
    'taking', 'things', 'school', 'student', 'major', 'u', 'people', 'around',
    'going', 'years', 'definitely', 'two', 'said', 'now', 'see', 'way', 'm',
    'first', 'quarter', 'took', 'sure', 'week', 'find', 'good', 'need', 'year',
    'think', 'nu', 'northwestern', 'well', 'know', 'classes', 'class', 'want',
    'don', 'even', 'make', 'still', 'last',
    # These two were written as 'need''think' (no comma), which Python joins
    # into the single bogus entry 'needthink'.
    'need', 'think',
    'much', 'take', 'will', 'really', 'though', 'thing', 'one', 'fall',
    'winter', 'spring', 'summer', 's', 't', 'go', 'got', 'lot', 're', 'say',
])

# remove words of non descriptive value
def clean(text):
    """Tokenise *text* and return its lowercase, alphabetic, non-stopword tokens.

    Args:
        text: one comment body. Anything that is not a string (for example
            the NaN that pandas.read_csv produces for empty or "NA"-like
            cells) yields an empty list instead of crashing the tokenizer.

    Returns:
        List of lowercased tokens that consist only of letters and are not in
        the module-level `stopwords` set.
    """
    if not isinstance(text, str):
        return []
    # isalpha() already rejects every token containing punctuation or digits,
    # so a separate punctuation-stripping pass would have nothing to remove.
    tokens = (token.lower() for token in word_tokenize(text) if token.isalpha())
    return [token for token in tokens if token not in stopwords]

def counter(text):
    """Tally how often each word occurs across an iterable of word lists.

    `text` is iterated message by message, and every word of every message
    adds one to that word's count. Returns a collections.Counter.
    """
    return Counter(word for message in text for word in message)

def season_word_count(text, top_n=10):
    """Return the most frequent cleaned words in a column of comment bodies.

    Args:
        text: pandas Series of comment bodies (one string per row).
        top_n: how many of the most common words to keep. Defaults to 10.

    Returns:
        DataFrame with columns 'Words' and 'Counts', most frequent first.
    """
    cleaned = text.apply(clean)
    top_words = counter(cleaned).most_common(top_n)
    return pd.DataFrame(top_words, columns=['Words', 'Counts'])

sns.set()
fig, axes = plt.subplots(2, 2, figsize=(26, 16))
plt.subplots_adjust(wspace=0.2)  # adjusting white space between individual plots

# One panel per academic season, filled left-to-right then top-to-bottom:
# (panel title, season subset, seaborn palette).
season_panels = [
    ("Fall", Fall_academic, "YlOrRd"),
    ("Winter", Winter_academic, "ch:s=.25,rot=-.25"),
    ("Spring", Spring_academic, 'Greens'),
    ("Summer", Summer_academic, 'Reds'),
]
for panel_ax, (panel_title, panel_df, panel_palette) in zip(axes.flat, season_panels):
    sns.barplot(
        ax=panel_ax,
        y='Words',
        x='Counts',
        data=season_word_count(panel_df['body']),
        palette=panel_palette,
    )
    panel_ax.set_title(panel_title)
fig.suptitle('Most Common Words Used by Academic Seasons ', fontsize=30)

# Weather
from datetime import datetime
import matplotlib.pyplot as plt
from meteostat import Point, Daily
import math
import numpy as np

def get_weather(data, start, end):
    """Fetch daily weather for Evanston between *start* and *end*.

    Args:
        data: ignored. The incoming value is never read; the parameter is
            kept only so existing calls keep working.
        start: sequence whose first three items are year, month, day.
        end: sequence whose first three items are year, month, day.

    Returns:
        The frame returned by ``Daily(...).fetch()``, with its 'tavg' column
        converted from Celsius to Fahrenheit. Other columns are untouched.
    """
    # Time period as datetime objects
    first_day = datetime(start[0], start[1], start[2])
    last_day = datetime(end[0], end[1], end[2])

    # Point for Evanston
    evanston = Point(42.0411, -87.6901, 70)

    # Daily records for the requested period
    daily = Daily(evanston, first_day, last_day).fetch()

    # convert C to F
    daily['tavg'] = daily['tavg'] * 9 / 5 + 32
    return daily
#Get Evanston Temperature Daily Avgs
# Keep only the average-temperature column and name the index 'date' so it can
# be joined against frames indexed by 'date'. The first argument to
# get_weather is never read, so an empty list is passed.
all_weather = get_weather([], [2019, 1, 1], [2022, 10, 31])[['tavg']].rename_axis('date')

# Re-read the raw exports and index them by 'date' for the weather join.
# NOTE(review): 'date', 'month' and 'negative_count' are not created anywhere
# in this section for this frame — presumably they are columns of the CSVs;
# confirm.
merged_neg = (
    pd.concat(map(pd.read_csv, files), ignore_index=True)
    .drop(columns=['id', 'permalink', 'subreddit', 'created', 'd_', 'Unnamed: 0'])
    .reset_index()
    .drop(columns=['index'])
    .set_index('date')
)

# Attach the daily average temperature, then re-index by month.
neg_temp = merged_neg.join(all_weather).reset_index().set_index('month')

# Per month: total negative_count and mean temperature.
neg_temp = neg_temp.groupby(neg_temp.index).agg({'negative_count': 'sum', 'tavg': 'mean'})
# Second pass sums each (already single-row) month group.
neg_temp = neg_temp.groupby(neg_temp.index).sum()
neg_temp = neg_temp.reset_index().rename(columns={"negative_count": "help_mentions"})

# Correlation between monthly help mentions and average temperature.
neg_temp[["help_mentions", 'tavg']].corr()


#Depressed VS Temperature
# Tag each comment with its season under the normal_seasons mapping; the
# "depressed" counts below are computed on the frame returned here.
merged_normal_pos=calendar_season(merged,normal_seasons)
def daily_counter_pos(text):
    """Add a 'positive_count' column counting "depressed" in each 'body'.

    `text` is a DataFrame with a 'body' column. The new column is written
    onto that same frame, which is also returned.

    NOTE(review): the match is case-sensitive, so "Depressed" is not
    counted — confirm that is intended.
    """
    depressed_hits = text['body'].str.count("depressed")
    text['positive_count'] = depressed_hits
    return text
merged_normal_pos = daily_counter_pos(merged_normal_pos)

# Total "depressed" mentions per date, as a one-column frame indexed by date.
merged_normal_pos = pd.DataFrame(
    merged_normal_pos.set_index('date')[['positive_count']]
    .groupby('date')['positive_count']
    .sum()
)

# Attach the daily average temperature by date.
# NOTE(review): the left index holds values produced by `.dt.date` while the
# weather index comes from meteostat — confirm the two index types actually
# match on join, otherwise 'tavg' will be all missing.
normal_temp_pos = merged_normal_pos.join(all_weather)

# Both series on one axis. `normalize` is not defined in this section —
# presumably it rescales the numeric columns; verify where it is defined.
fig, axes = plt.subplots(figsize=(30, 10))
sns.lineplot(
    data=normalize(normal_temp_pos.reset_index()),
    x='date',
    y='positive_count',
)
sns.lineplot(
    data=normalize(normal_temp_pos.reset_index()),
    x='date',
    y='tavg',
    color='green',
)
axes.legend(['Depressed Count', 'Daily Avg Temp'])
axes.set_title('Temperature VS mentions of depressed in Posts')