[View in Colaboratory](https://colab.research.google.com/github/gowtham91m/gofundme/blob/master/process.ipynb)

In [143]:
import pandas as pd
import re
import nltk
from nltk import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
class clean_data():
  """Cleaning pipeline for a GoFundMe campaign table.

  `process` is the entry point: it normalises the abbreviated numeric
  columns, splits `location` into `city` / `state`, normalises the text
  columns and drops the `location` and `href` columns.
  """

  # English stop words, loaded from NLTK on first use and then reused so the
  # corpus is not re-read for every cell of the frame.
  _stop_words = None

  def __init__(self):
    # Columns normalised by `text_process` (city/state are created in `process`).
    self.text_columns = ['category','title','city','state','text']
    # Columns holding abbreviated amounts/counts such as "$1,234" or "1.2k".
    self.num_cols_to_clean = ['goal','raised','likes','shares','donation_count']

  def str_to_num(self,x):
    """Expand an abbreviated amount such as "$1.2k" into a digit string.

    Thousands separators and dollar signs are removed and a trailing
    `k` / `M` suffix is expanded to thousands / millions, honouring a
    decimal fraction in front of the suffix ("1.2k" -> "1200",
    "$1.5M" -> "1500000"). Digits beyond the suffix's precision are
    truncated.

    Args:
      x: the raw cell value. Non-string values (e.g. NaN, None, numbers)
        are returned unchanged.

    Returns:
      The cleaned value as a string (or `x` itself when it is not a string).
    """
    if not isinstance(x, str):
      return x

    zeros_for_suffix = {'k': 3, 'M': 6}

    def expand(match):
      number, suffix = match.groups()
      whole, _, fraction = number.partition('.')
      zeros = zeros_for_suffix[suffix]
      # Pad (or truncate) the fraction to the width the suffix stands for,
      # e.g. "1" + "2".ljust(3, "0") -> "1200"; int() drops a leading "0".
      return str(int(whole + fraction[:zeros].ljust(zeros, '0')))

    x = re.sub(r'[,$]', '', x)
    # Expand "<number><suffix>" BEFORE touching the decimal point; removing the
    # point first would turn "1.2k" into "12000".
    x = re.sub(r'(\d+(?:\.\d*)?|\.\d+)([kM])', expand, x)
    # Anything left is handled exactly as before: drop remaining dots and
    # expand suffixes that were not attached to a number.
    x = x.replace('.', '').replace('M', '000000').replace('k', '000')
    return x

  def remove_stop_words(self,x):
    """Tokenise `x` with NLTK and return it without English stop words.

    The surviving tokens are joined back together with single spaces.
    """
    if self._stop_words is None:
      self._stop_words = frozenset(stopwords.words('english'))
    return ' '.join(
        token for token in word_tokenize(x) if token not in self._stop_words
    )

  def text_process(self,x):
    """Lower-case `x`, keep only [a-z0-9] words and remove stop words.

    Non-string values (e.g. NaN / None for a missing state) are returned
    unchanged instead of raising.
    """
    if not isinstance(x, str):
      return x

    x = x.lower()
    # replace non-alphanumeric characters with a space
    x = re.sub('[^a-z0-9]',' ',x)

    # replace multiple spaces with a single space
    x = re.sub(' {2,}', ' ', x)

    # remove stop words
    return self.remove_stop_words(x)

  def process(self,df):
    """Return a cleaned copy of the campaign frame `df`.

    `df` must contain the columns in `num_cols_to_clean`, the text columns
    other than `city` / `state`, plus `location` and `href`. The caller's
    frame is left untouched.
    """
    # Work on a copy so the caller's frame is not half-modified.
    df = df.copy()

    # Column-wise Series.map instead of the deprecated DataFrame.applymap.
    num_cols = self.num_cols_to_clean
    df[num_cols] = df[num_cols].apply(lambda col: col.map(self.str_to_num))

    # Split "city, state" on the first comma only. `n` must be passed by
    # keyword (positional use is rejected by pandas 2.x); reindex guarantees a
    # `state` column even when no location contains a comma.
    location_parts = (
        df['location'].str.split(',', n=1, expand=True)
        .reindex(columns=[0, 1])
        .rename(columns={0:'city', 1:'state'})
    )
    df = df.join(location_parts)

    text_cols = self.text_columns
    df[text_cols] = df[text_cols].apply(lambda col: col.map(self.text_process))

    return df.drop(columns=['location','href'])

In [0]:
# Medical-category campaign data, read straight from the project's GitHub repo.
MEDICAL_CSV_URL = 'https://raw.githubusercontent.com/gowtham91m/gofundme/master/data/medical.csv'

# Load the raw table, then run it through the cleaning pipeline.
campaigns_raw = pd.read_csv(MEDICAL_CSV_URL)
cleaner = clean_data()
campaigns = cleaner.process(campaigns_raw)

In [160]:
# Preview the first five rows of the cleaned frame.
campaigns.head(n=5)

Unnamed: 0,category,page,title,start_date,goal,raised,text,likes,shares,photos,donation_count,duration,recent_donation_time,goal_reaeched_time,script_run_time,city,state
0,medical,1,kdafoos cancer,"July 18, 2018",500000,676152,vee fighting cancer past 5 years going nowhere...,24000,330,11,2408,1 month,15 days ago,1 month,2018-08-27,houston,tx
1,medical,1,92 yr old man brutally attacked,"July 6, 2018",15000,327345,july 4th around 7pm grandfather rodolfo rodrig...,12000,26000,4,12167,1 month,1 month ago,1 month,2018-08-27,los angeles,ca
2,medical,1,olivia stoy transplant liv,"May 18, 2018",10000000,316376,update many fundraisers put place raise funds ...,57000,12000,44,5601,3 months,1 day ago,3 months,2018-08-27,ashley,
3,medical,1,autologous cell transplant,"June 22, 2018",250000,241205,43 year old philip defonte husband father 3 yo...,839,18000,0,843,2 months,1 month ago,0,2018-08-27,staten island,ny
4,medical,1,claire wineland needs help,"June 12, 2018",225000,237555,first wow blown away amazing outpouring love s...,90000,65000,200,8432,2 months,23 hours ago,1 month,2018-08-27,garden grove,ca
