[View in Colaboratory](https://colab.research.google.com/github/gowtham91m/gofundme/blob/master/process.ipynb)

In [51]:
import pandas as pd
import re
import os
import nltk
import numpy as np
from nltk import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
campaigns = pd.read_csv('https://raw.githubusercontent.com/gowtham91m/gofundme/master/data/campaigns.csv')

# Keep the first two space-separated tokens of each value: the count and its
# unit (e.g. `15` / `days`). `.copy()` makes the slice an independent frame so
# that adding a column below cannot trigger a SettingWithCopyWarning.
recent_donation = (
    campaigns['recent_donation_time']
    .str.split(' ', expand=True)
    .iloc[:, 0:2]
    .copy()
)
print(recent_donation.iloc[:, 1].unique())

# Update all rows to month-level information, since we don't have day-level
# information after one month.
# Anything less than a month is 0.
# All the data will be in months, so we can retain the numbers alone.
is_month_unit = recent_donation.iloc[:, 1].isin(['months', 'month'])
# Convert the counts to numbers so the new column is a uniform integer column
# instead of a mix of int 0 and strings such as '1'.
month_counts = pd.to_numeric(recent_donation.iloc[:, 0], errors='coerce')
recent_donation['months_since_last_donation'] = (
    month_counts.where(is_month_unit, 0).fillna(0).astype(int)
)
# campaigns = campaigns.join(recent_donation['months_since_last_donation'])
recent_donation.head()

['days' 'month' 'day' 'hours' 'months' 'hour' 'mins' 'min']


Unnamed: 0,0,1,months_since_last_donation
0,15,days,0
1,1,month,1
2,1,day,0
3,1,month,1
4,23,hours,0


In [0]:
# The same transformation will be applied to the duration and goal_reaeched_time columns.

In [0]:
# putting everything together

class clean_data():
  """Cleaning pipeline for the campaigns table.

  `process` is the entry point. It normalises the abbreviated numeric
  columns, splits `location` into `city` / `state`, normalises the free-text
  columns, parses the two date columns and reduces the relative-time columns
  to a whole number of months. The remaining methods are the per-cell and
  per-column helpers that `process` applies.
  """

  # Lazily built set of English stop words, cached on the class so the NLTK
  # corpus is read once rather than once per processed cell.
  _stop_words = None

  # Number of zeros implied by each magnitude suffix understood by str_to_num.
  _suffix_zeros = {'': 0, 'k': 3, 'K': 3, 'M': 6}

  def __init__(self):
    self.text_columns = ['category','title','city','state','text']
    self.num_cols_to_clean = ['goal','raised','likes','shares','donation_count']
    # NOTE: 'goal_reaeched_time' is spelled this way in the data's column
    # header, so it must not be "corrected" here.
    self.time_cols = ['duration','recent_donation_time','goal_reaeched_time']

  def str_to_num(self,x):
    """Expand an abbreviated amount such as '$1.5k' into a plain digit string.

    Currency signs, thousands separators and whitespace are removed, and a
    trailing `k`/`K` or `M` suffix is expanded by shifting the decimal point
    three or six places ('1.5k' -> '1500', '$1.2M' -> '1200000'). The decimal
    point is honoured rather than deleted, so fractional abbreviations are not
    inflated by a factor of ten.

    Args:
      x: The raw cell value.

    Returns:
      A string of digits (with a fractional part only if one remains after
      the shift). Non-string input is returned unchanged, and text that is not
      a recognisable number is returned with only the separators stripped.
    """
    if not isinstance(x, str):
      return x
    cleaned = re.sub(r'[,\$\s]', '', x)
    match = re.fullmatch(r'(\d*)(?:\.(\d*))?([kKM]?)', cleaned)
    if match is None or not (match.group(1) or match.group(2)):
      return cleaned
    whole = match.group(1)
    frac = match.group(2) or ''
    zeros = self._suffix_zeros[match.group(3)]
    # Shift the decimal point right by `zeros` places using string padding,
    # which avoids any floating-point rounding.
    frac = frac.ljust(zeros, '0')
    digits = (whole + frac[:zeros]).lstrip('0') or '0'
    remainder = frac[zeros:].rstrip('0')
    return digits + ('.' + remainder if remainder else '')

  def remove_stop_words(self,x):
    """Return `x` with English stop words removed, tokens joined by one space."""
    if clean_data._stop_words is None:
      clean_data._stop_words = set(stopwords.words('english'))
    stop_words = clean_data._stop_words
    return ' '.join(word for word in word_tokenize(x) if word not in stop_words)

  def time_parser(self,x):
    """Reduce a column of '<count> <unit> ...' strings to whole months.

    Args:
      x: A pandas Series of strings whose first two space-separated tokens
        are a count and a unit.

    Returns:
      An integer numpy array with one entry per row: the count when the unit
      is 'month' or 'months', otherwise 0 (this includes missing values and
      counts that are not numeric).
    """
    # `n` must be passed by keyword: pandas >= 2.0 rejects it positionally.
    # reindex guarantees both columns exist even if no value contains a space.
    parts = x.str.split(' ', n=2, expand=True).reindex(columns=[0, 1])
    is_month = parts[1].isin(['months','month'])
    counts = pd.to_numeric(parts[0], errors='coerce').fillna(0)
    return np.where(is_month, counts, 0).astype(int)

  def text_process(self,x):
    """Lower-case `x`, keep only [a-z0-9] words and drop stop words.

    Missing values (None / NaN) become an empty string; any other non-string
    value is converted with `str` first.
    """
    if not isinstance(x, str):
      if x is None or (isinstance(x, float) and x != x):
        return ''
      x = str(x)
    x = x.lower()
    # replace non-alphanumeric characters with a space character
    x = re.sub('[^a-z0-9]',' ',x)

    # replace multiple spaces with a single space
    x = re.sub('(  +)', ' ', x)

    # remove stop words
    x = self.remove_stop_words(x)
    return x

  def process(self,df):
    """Run the full cleaning pipeline and return the cleaned frame.

    Args:
      df: DataFrame containing at least the columns listed in
        `num_cols_to_clean` and `time_cols`, plus 'category', 'title',
        'text', 'location', 'href', 'start_date' and 'script_run_date'.

    Returns:
      A new DataFrame; the frame passed in is left untouched. 'location' and
      'href' are dropped and 'city' / 'state' are appended as the last
      columns. The numeric columns remain strings of digits.
    """
    # Work on a copy so that re-running the cell on the same input is safe.
    df = df.copy()

    # Series.map is used column by column because DataFrame.applymap is
    # deprecated in recent pandas releases.
    for col in self.num_cols_to_clean:
      df[col] = df[col].map(self.str_to_num)

    # Split "city, state" on the first comma only; reindex guarantees a
    # 'state' column even when no location contains a comma.
    location = (
      df['location'].str.split(',', n=1, expand=True)
      .reindex(columns=[0, 1])
      .rename(columns={0:'city', 1:'state'})
    )
    df = df.join(location)

    for col in self.text_columns:
      df[col] = df[col].map(self.text_process)
    df = df.drop(['location','href'], axis=1)

    df['start_date'] = pd.to_datetime(df['start_date'])
    df['script_run_date'] = pd.to_datetime(df['script_run_date'])

    # process the time range variables: duration (overall campaign time),
    # recent_donation_time and goal_reaeched_time
    for col in self.time_cols:
      df[col] = self.time_parser(df[col])

    return df

In [0]:
if __name__ == '__main__':
  # Load the campaigns CSV from GitHub, then run it through the cleaning
  # pipeline; later cells read the result from `campaigns`.
  campaigns_url = 'https://raw.githubusercontent.com/gowtham91m/gofundme/master/data/campaigns.csv'
  source_frame = pd.read_csv(campaigns_url)
  cleaner = clean_data()
  campaigns = cleaner.process(source_frame)

In [42]:
# Display the first rows of `campaigns` as a quick visual check.
campaigns.head()

Unnamed: 0,category,page,title,start_date,goal,raised,text,likes,shares,photos,donation_count,duration,recent_donation_time,goal_reaeched_time,script_run_date,city,state
0,medical,1,kdafoos cancer,2018-07-18,500000,676152,vee fighting cancer past 5 years going nowhere...,24000,330,11,2408,1,0,1,2018-08-27,houston,tx
1,medical,1,92 yr old man brutally attacked,2018-07-06,15000,327345,july 4th around 7pm grandfather rodolfo rodrig...,12000,26000,4,12167,1,1,1,2018-08-27,los angeles,ca
2,medical,1,olivia stoy transplant liv,2018-05-18,10000000,316376,update many fundraisers put place raise funds ...,57000,12000,44,5601,3,0,3,2018-08-27,ashley,
3,medical,1,autologous cell transplant,2018-06-22,250000,241205,43 year old philip defonte husband father 3 yo...,839,18000,0,843,2,1,0,2018-08-27,staten island,ny
4,medical,1,claire wineland needs help,2018-06-12,225000,237555,first wow blown away amazing outpouring love s...,90000,65000,200,8432,2,0,1,2018-08-27,garden grove,ca


In [0]:
# Optional: create and switch into a `data` directory before saving.
#os.mkdir('data')
#os.chdir('data')
# Write the frame to CSV; index=False keeps the row index out of the file.
campaigns.to_csv('campaigns_cleaned.csv', index=False)