[View in Colaboratory](https://colab.research.google.com/github/gowtham91m/gofundme/blob/master/process.ipynb)

In [1]:
import pandas as pd
import re
import os
import shutil
import nltk
import numpy as np
from nltk import word_tokenize
nltk.download(['stopwords','wordnet','punkt'])
#nltk.download('punkt')
from nltk.corpus import stopwords
from google.colab import files
from nltk.stem import WordNetLemmatizer
from getpass import getpass
import subprocess

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
#campaigns = pd.read_csv('https://raw.githubusercontent.com/gowtham91m/gofundme/master/data/campaigns.csv')                        

#recent_donation = campaigns['recent_donation_time'].str.split(' ', expand=True).iloc[:,0:2]
#print(recent_donation.iloc[:,1].unique())

# update all rows to month level information, since we don't have day level informaiton after one month.
# anyting less than a month is 0 
# all the data will be in months, we can retain the numbers alone.
#recent_donation['months_since_last_donation'] = np.where(~recent_donation.iloc[:,1].isin(['months','month']), 0, recent_donation.iloc[:,0])
#ampaigns = campaigns.join(recent_donation['months_since_last_donation'])
#recent_donation.head()

In [0]:
# same transfromation will be applied to duration and goals_reached_time columns 

In [0]:
# putting everything together
class clean_data():
  def __init__(self):
    self.text_columns = ['category','title','city','state','text']
    self.num_cols_to_clean = ['goal','raised','likes','shares','donation_count']
    self.time_cols = ['duration','recent_donation_time','goal_reached_time']
    
  def str_to_num(self,x):
      x = re.sub('[,\$\.]','',x)
      x = re.sub('M','000000',x)
      x = re.sub('k','000',x)
      return x
    
  def remove_stop_words(self,x):
    lemma = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    x_words = word_tokenize(x)
    x_words = [lemma.lemmatize(i) for i in x_words if not i in stop_words]
    return ' '.join(x_words)
  
  def time_parser(self,x):
    x = x.str.split(' ',2, expand=True)
    return np.where(~x.iloc[:,1].isin(['months','month']), 0, x.iloc[:,0])

  def text_process(self,x):
    x = x.lower()
    # replace non alpha numerical characters with space character
    x = re.sub('[^a-z0-9]',' ',x)
    
    # replace multiple spaceses with single space 
    x = re.sub('(  +)', ' ', x)
    
    # remove stop words
    x = self.remove_stop_words(x)
    return x
  
  def process(self,df):
    df[self.num_cols_to_clean] = df[self.num_cols_to_clean].applymap(self.str_to_num)
    df = df.join(df['location'].str.split(',', 1, expand=True).rename(columns={0:'city', 1:'state'}))
    df['text_length'] = df.text.apply(lambda x: len(x))
    df[self.text_columns] = df[self.text_columns].applymap(self.text_process)
    df.drop(['location','href'],inplace=True,axis=1)
    
    df['start_date'] = pd.to_datetime(df.start_date)
    df['script_run_date'] = pd.to_datetime(df.script_run_date)
    
    # process the time range variables, duration (overa all campaign time), goal_reached_time, recent_donation_time
    df[self.time_cols] = df[self.time_cols].apply(self.time_parser)
    
    return df
  
  

class git_put:
  def __init__(self,root_dir = os.getcwd()):
      self.root = root_dir
      self.datapath = os.path.join(root_dir,'gofundme/data')
      self.project_dir = os.path.join(root_dir,'gofundme')
      self.git_repo = '@github.com/gowtham91m/gofundme.git'

  def git_clone(self):
    os.chdir(self.root)
    if 'gofindme' not in os.listdir(self.root):
      #gt = 'https://@github.com/gowtham91m/gofundme.git'
      subprocess.Popen(['git', 'clone', str('https://'+self.git_repo)])
    else:
      os.chdir(self.datapath)
      subprocess.Popen(['git','pull'])

  def git_push(self):
    os.chdir(self.project_dir)
    
    username = input('user name: ')
    password = getpass('password: ')

    !git config user.email "gowtham.91m@gmail.com"
    !git config user.name "Gowtham Mallikarjuna"
    !git add .
    !git commit -m "commit"
    #!git push -u origin "master"
    subprocess.Popen(['git','push','-u','https://'+username+':'+password+self.git_repo,'--all'])
    
  def put_cleaned_data(self,df):
    self.git_clone()
    df.to_csv(os.path.join(self.datapath,'campaigns_cleaned.csv'),index=False)
    
    self.git_push()


In [6]:
if __name__ == '__main__':
  os.chdir('/content')
  campaigns = pd.read_csv('https://raw.githubusercontent.com/gowtham91m/gofundme/master/data/campaigns.csv')
  cleaned_campaigns = clean_data().process(campaigns)
  os.chdir('/content')
  git_put().put_cleaned_data(cleaned_campaigns)

user name: gowtham91m
password: ··········
[master e8a6903] commit
 1 file changed, 1001 insertions(+), 1001 deletions(-)
 rewrite data/campaigns_cleaned.csv (82%)
