In [2]:
# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os
'''
# Plotly based imports for visualization
from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
'''
# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
!python -m spacy download en_core_web_lg

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
ngos = pd.read_csv('C:\\Users\\ddeto\\PycharmProjects\\AidlyAI\\Data\\berks_NGOs.csv')
ngos.head()

Unnamed: 0,ID,Name,Mission_Statement
0,0,YMCA OF READING & BERKS COUNTY,To put Judeo-Christian principles into practic...
1,1,SAFE BERKS,The mission of Safe Berks is to provide suppor...
2,2,SOUTH MOUNTAIN YMCA,The Mission of the South Mountain YMCA is to p...
3,3,BAYADA HOSPICE,BAYADA Home Health Care has a special purpose—...
4,4,BERKS AGRICULTURAL RESOURCE NETWORK,The Berks Agricultural Resource Network (B.A.R...


In [5]:
import texthero as hero

ngos['pca'] = (
   ngos['Mission_Statement']
   .pipe(hero.clean)
   .pipe(hero.tfidf)
   .pipe(hero.pca)
)

ngos['clean'] = (
   ngos['Mission_Statement']
   .pipe(hero.clean)
)

ngos['tfidf'] = (
   ngos['Mission_Statement']
   .pipe(hero.clean)
   .pipe(hero.tfidf)
)

ngos['kmeans_labels'] = (
    ngos['tfidf']
    .pipe(hero.kmeans, n_clusters=5)
    .astype(str)
)

ngos['tokenize'] = (
    ngos['Mission_Statement']
    .pipe(hero.fillna)
    .pipe(hero.lowercase)
    .pipe(hero.remove_punctuation)
    .pipe(hero.remove_whitespace)
    .pipe(hero.tokenize)
)

In [6]:
import pandas as pd
df = pd.read_csv('C:\\Users\\ddeto\\PycharmProjects\\AidlyAI\\Data\\berks_NGOs.csv')
corpus_text = '\n'.join(df[:50000]['Mission_Statement'])
sentences = corpus_text.split('\n')
sentences = [line.lower().split(' ') for line in sentences]

def clean(s):
    return [w.strip(',."!?:;()\'') for w in s]
sentences = [clean(s) for s in sentences if len(s) > 0]


In [9]:
words_from_df = []

for index, row in ngos.iterrows():
    words_from_df.append(row['tokenize'])

words_from_df

[['to',
  'put',
  'judeo',
  'christian',
  'principles',
  'into',
  'practice',
  'through',
  'programs',
  'that',
  'build',
  'healthy',
  'spirit',
  'mind',
  'and',
  'body',
  'for',
  'all'],
 ['the',
  'mission',
  'of',
  'safe',
  'berks',
  'is',
  'to',
  'provide',
  'supportive',
  'services',
  'for',
  'victims',
  'and',
  'eliminate',
  'domestic',
  'and',
  'sexual',
  'violence',
  'in',
  'berks',
  'county'],
 ['the',
  'mission',
  'of',
  'the',
  'south',
  'mountain',
  'ymca',
  'is',
  'to',
  'provide',
  'a',
  'camping',
  'experience',
  'educational',
  'and',
  'other',
  'programs',
  'that',
  'build',
  'personal',
  'leadership',
  'character',
  'and',
  'life',
  'skills',
  'while',
  'developing',
  'caring',
  'honesty',
  'respect',
  'and',
  'responsibility',
  'in',
  'young',
  'people',
  'families',
  'and',
  'communities'],
 ['bayada',
  'home',
  'health',
  'care',
  'has',
  'a',
  'special',
  'purpose—to',
  'help',
  'peop

In [14]:
from gensim.models import Word2Vec

model = Word2Vec(words_from_df, size=100, window=5, min_count=1, workers=4)

vectors = model.wv

model.wv.vocab

{'to': <gensim.models.keyedvectors.Vocab at 0x22df9d14d60>,
 'put': <gensim.models.keyedvectors.Vocab at 0x22df9d14dc0>,
 'judeo': <gensim.models.keyedvectors.Vocab at 0x22df9d14100>,
 'christian': <gensim.models.keyedvectors.Vocab at 0x22df9d14c70>,
 'principles': <gensim.models.keyedvectors.Vocab at 0x22df9d14df0>,
 'into': <gensim.models.keyedvectors.Vocab at 0x22df9d14f70>,
 'practice': <gensim.models.keyedvectors.Vocab at 0x22df9a50790>,
 'through': <gensim.models.keyedvectors.Vocab at 0x22df9a50160>,
 'programs': <gensim.models.keyedvectors.Vocab at 0x22df9a50c10>,
 'that': <gensim.models.keyedvectors.Vocab at 0x22df9a50970>,
 'build': <gensim.models.keyedvectors.Vocab at 0x22df9a50ee0>,
 'healthy': <gensim.models.keyedvectors.Vocab at 0x22df9a50f10>,
 'spirit': <gensim.models.keyedvectors.Vocab at 0x22df9a509d0>,
 'mind': <gensim.models.keyedvectors.Vocab at 0x22df9a50ac0>,
 'and': <gensim.models.keyedvectors.Vocab at 0x22df9a50580>,
 'body': <gensim.models.keyedvectors.Vocab at

In [19]:


vectors['education']

vectors.most_similar('women')





[('promote', 0.27459996938705444),
 ('unique', 0.264776349067688),
 ('enjoyment', 0.2593572735786438),
 ('efforts', 0.24668043851852417),
 ('judeo', 0.23361074924468994),
 ('cooperates', 0.21954983472824097),
 ('extend', 0.21038231253623962),
 ('ymca', 0.20822100341320038),
 ('are', 0.20215332508087158),
 ('foundation', 0.20075657963752747)]