In [1]:
# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os
'''
# Plotly based imports for visualization
from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
'''
# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
!python -m spacy download en_core_web_lg

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


In [2]:
ngos = pd.read_csv('C:\\Users\\ddeto\\PycharmProjects\\AidlyAI\\Data\\MISSION_CLEAN.csv')
ngos.head()

Unnamed: 0,EIN,NAME,TAXYR,FORMTYPE,OBJECTID,URL,F9_03_PC_NEW_PROG_CHECKBOX,F9_03_PC_SIG_CHANGE_CHECKBOX,F9_03_PC_TOTAL_PROG_EXP,F9_03_PZ_MISSION,...,Onethirdsupportgifts,Benefitofcollege,Privatefoundation508e,Seekingretroreinstatement,Seekingsec7reinstatement,Correctnessdeclaration,Signaturename,Signaturetitle,Signaturedate,NTEE
0,10716217,THE SAN FRANCISCO HOUSING ACCELERATOR,2016,990,2.01811e+17,https://s3.amazonaws.com/irs-form-990/20181135...,0,0,401061.0,THE CORPORATION'S SPECIFIC PURPOSE IS TO SUPPO...,...,1,0,0,1,0,1,TERESA YANGA,EXECUTIVE DIRECTOR,7/30/2015,L
1,10842551,CREEKSIDE ELEMENTARY PTO INC,2015,990,2.01622e+17,https://s3.amazonaws.com/irs-form-990/20162238...,FALSE,FALSE,,SUPPORT HARTLAND CREEKSIDE ELEMENTARY SCHOOL.,...,0,0,0,1,0,1,CARA SUKSI,TREASURER,9/23/2015,B
2,20792368,COVINGTON COMMUNITY SPORTS INC,2011,990,2.01242e+17,https://s3.amazonaws.com/irs-form-990/20124195...,FALSE,FALSE,182141.0,COMMUNITY SPORTS PROGRAMS.,...,1,0,0,1,0,1,SHAWN STEWART,VICE PRESIDENT,9/6/2016,N
3,43611860,METRO COMMUNITY DEVELOPMENT,2016,990,2.01713e+17,https://s3.amazonaws.com/irs-form-990/20171318...,0,0,139435.0,"TO RELIEVE THE POOR, THE DISTRESSED AND THE UN...",...,0,0,0,1,0,1,DELORES WILKERSON-JACOBS,BOARD PRESIDENT,12/10/2014,N
4,43611860,METRO COMMUNITY DEVELOPMENT,2015,990,2.01642e+17,https://s3.amazonaws.com/irs-form-990/20164162...,0,0,165035.0,"TO RELIEVE THE POOR, THE DISTRESSED AND THE UN...",...,0,0,0,1,0,1,DELORES WILKERSON-JACOBS,BOARD PRESIDENT,12/10/2014,N


In [3]:
import texthero as hero

ngos['pca'] = (
   ngos['F9_03_PZ_MISSION']
   .pipe(hero.clean)
   .pipe(hero.tfidf)
   .pipe(hero.pca)
)

ngos['clean'] = (
   ngos['F9_03_PZ_MISSION']
   .pipe(hero.clean)
)

ngos['tfidf'] = (
   ngos['F9_03_PZ_MISSION']
   .pipe(hero.clean)
   .pipe(hero.tfidf)
)

ngos['kmeans_labels'] = (
    ngos['tfidf']
    .pipe(hero.kmeans, n_clusters=5)
    .astype(str)
)

ngos['tokenize'] = (
    ngos['F9_03_PZ_MISSION']
    .pipe(hero.fillna)
    .pipe(hero.lowercase)
    .pipe(hero.remove_punctuation)
    .pipe(hero.remove_whitespace)
    .pipe(hero.tokenize)
)

In [4]:
import pandas as pd
df = pd.read_csv('C:\\Users\\ddeto\\PycharmProjects\\AidlyAI\\Data\\berks_NGOs.csv')
corpus_text = '\n'.join(df[:50000]['Mission_Statement'])
sentences = corpus_text.split('\n')
sentences = [line.lower().split(' ') for line in sentences]

def clean(s):
    return [w.strip(',."!?:;()\'') for w in s]
sentences = [clean(s) for s in sentences if len(s) > 0]


In [5]:
words_from_df = []

for index, row in ngos.iterrows():
    words_from_df.append(row['tokenize'])

words_from_df

[['the',
  'corporation',
  's',
  'specific',
  'purpose',
  'is',
  'to',
  'supports',
  'affordable',
  'housing',
  'community',
  'development',
  'and',
  'economic',
  'development',
  'of',
  'the',
  'city',
  'and',
  'county',
  'of',
  'san',
  'francisco',
  's',
  'economically',
  'disadvantaged',
  'individuals',
  'and',
  'communities',
  'by',
  'lending',
  'to',
  'investing',
  'in',
  'and',
  'directly',
  'acquiring',
  'such',
  'affordable',
  'housing',
  'and',
  'related',
  'community',
  'development',
  'real',
  'estate',
  'assets'],
 ['support', 'hartland', 'creekside', 'elementary', 'school'],
 ['community', 'sports', 'programs'],
 ['to',
  'relieve',
  'the',
  'poor',
  'the',
  'distressed',
  'and',
  'the',
  'underprivilaged',
  'and',
  'combat',
  'community',
  'deterioration',
  'in',
  'the',
  'buffalo',
  'niagara',
  'region',
  'of',
  'new',
  'york',
  'state',
  'by',
  'establishing',
  'and',
  'maintaining',
  'a',
  'faith',
 

In [9]:
from gensim.models import Word2Vec

model = Word2Vec(words_from_df, size=100, window=5, min_count=1, workers=4)

vectors = model.wv

len(model.wv.vocab)

6533

In [10]:


vectors['education']

vectors.most_similar('soccer')





[('based', 0.999845027923584),
 ('arts', 0.9998424053192139),
 ('club', 0.999841570854187),
 ('its', 0.9998363256454468),
 ('maintain', 0.9998341798782349),
 ('county', 0.9998312592506409),
 ('schools', 0.9998174905776978),
 ('sports', 0.9998170733451843),
 ('art', 0.9998161196708679),
 ('society', 0.999815821647644)]