In [1]:
import pandas as pd
import numpy as np

In [2]:
newsArticles = pd.read_excel("reutersNLTK.xlsx")

In [3]:
newsArticles.head()

Unnamed: 0,ids,categories,text
0,test/14826,['trade'],ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...
1,test/14828,['grain'],CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...
2,test/14829,"['crude', 'nat-gas']",JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...
3,test/14832,"['corn', 'grain', 'rice', 'rubber', 'sugar', '...",THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...
4,test/14833,"['palm-oil', 'veg-oil']",INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...


In [4]:
import nltk

# Text Preprocessing routines:
1. Removal of special characters and accented characters
2. Removal of case (lowercasing)
3. Tokenize phrase
4. Stem, lemmatize the tokens
5. Get POS tags of the words
7. n-grams
8. TF-IDF vectors
# Text Exploration:
10. Collocations
11. Concordance
12. Word cloud
13. Word Frequency
14. Concordance
15. Similar
# Text Features:
16. Bag of words
17. N Grams
18. Morphological Parsing
19. Word to Vec
# Modeling:
20. Doc similarity
21. NER

# Processing

In [6]:
!pip install unidecode

Collecting unidecode
  Downloading https://files.pythonhosted.org/packages/31/39/53096f9217b057cb049fe872b7fc7ce799a1a89b76cf917d9639e7a558b5/Unidecode-1.0.23-py2.py3-none-any.whl (237kB)
Installing collected packages: unidecode
Successfully installed unidecode-1.0.23


In [9]:
# WordNet alone is not enough: stopwords.words(), nltk.word_tokenize() and
# nltk.pos_tag() are all used further down and each needs its own data package.
# Fetch everything up front so a fresh environment can run the notebook top to bottom.
# NOTE(review): recent NLTK releases may expect 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
for nltkResource in ('wordnet', 'stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltkResource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gopin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [10]:
from nltk.corpus import stopwords
import re
import unidecode 

# Special chars removal
def fnRemoveSpecialChars(phrase):
    """Replace every character that is not an ASCII letter or whitespace with a space.

    Digits, punctuation and non-ASCII characters each become a single space, so the
    result has the same length as the input. Whitespace (including newlines) is kept.

    Args:
        phrase: The text to clean (str).

    Returns:
        The cleaned text (str).
    """
    # Raw string: '\s' inside a normal literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on current Python versions.
    return re.sub(r'[^A-Za-z\s]', ' ', phrase)

# Accent removal
def fnRemoveAccent(phrase):
    """Return ``phrase`` after passing it through ``unidecode.unidecode``.

    Args:
        phrase: The text to convert (str).

    Returns:
        Whatever ``unidecode.unidecode`` produces for ``phrase``.
    """
    unaccentedPhrase = unidecode.unidecode(phrase)
    return unaccentedPhrase

# Case removal
# NLTK stopword file ids are lowercase; "English" only resolves on case-insensitive
# file systems (e.g. Windows) and fails elsewhere.
stops = stopwords.words("english")
def fnCaseRemoval(phrase):
    """Return ``phrase`` converted to lower case.

    Args:
        phrase: The text to normalise (str).

    Returns:
        The lower-cased text (str).
    """
    loweredPhrase = phrase.lower()
    return loweredPhrase

# Stopword removal
def fnRemoveStopWords(stopWords, txt):
    """Drop every whitespace-separated word of ``txt`` that appears in ``stopWords``.

    Matching is exact and case-sensitive. The surviving words are re-joined with
    single spaces, so runs of whitespace in ``txt`` are collapsed as a side effect.

    Args:
        stopWords: Iterable of words to remove (e.g. a list or set of str).
        txt: The text to filter (str).

    Returns:
        The filtered text (str); an empty string if nothing survives.
    """
    # Build the lookup once: membership tests against a list are linear in the
    # number of stop words and would be repeated for every word of the text.
    if isinstance(stopWords, (set, frozenset)):
        stopLookup = stopWords
    else:
        stopLookup = frozenset(stopWords)
    return ' '.join(word for word in txt.split() if word not in stopLookup)

# Tokenize
def fnTokenizePrases(phrase):
    """Split ``phrase`` into tokens using ``nltk.word_tokenize``.

    Args:
        phrase: The text to tokenize (str).

    Returns:
        The tokens produced by ``nltk.word_tokenize``.
    """
    tokens = nltk.word_tokenize(phrase)
    return tokens

# Stem
def fnStemWithProter(phrase):
    """Stem every token of ``phrase`` with NLTK's Porter stemmer.

    The phrase is tokenized with ``fnTokenizePrases``; each token is stemmed and
    the stems are joined back together with single spaces.

    Args:
        phrase: The text to stem (str).

    Returns:
        The space-joined stems (str).
    """
    stemmer = nltk.PorterStemmer()
    stemmedTokens = []
    for token in fnTokenizePrases(phrase):
        stemmedTokens.append(stemmer.stem(token))
    return " ".join(stemmedTokens)
from nltk.stem import WordNetLemmatizer

#nltk.download('wordnet')
# Lemmatization
def fnWordLemm(phrase):
    """Lemmatize every token of ``phrase`` with NLTK's ``WordNetLemmatizer``.

    The phrase is tokenized with ``fnTokenizePrases``; each token is lemmatized
    with the lemmatizer's default settings and the results are joined with spaces.

    Args:
        phrase: The text to lemmatize (str).

    Returns:
        The space-joined lemmas (str).
    """
    wordnetLemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in fnTokenizePrases(phrase):
        lemmas.append(wordnetLemmatizer.lemmatize(token))
    return " ".join(lemmas)

# Pos tag
def fnPosTag(phrase):
    """Part-of-speech tag the tokens of ``phrase``.

    Args:
        phrase: The text to tag (str); it is tokenized with ``fnTokenizePrases``.

    Returns:
        A list with one entry per token, each entry being a one-element list
        holding the ``(token, tag)`` tuple, e.g. ``[[('trade', 'NN')], ...]``.
        This nested shape matches what the function has always returned.
    """
    words = fnTokenizePrases(phrase)
    # Tag the whole token sequence in a single call. Tagging each word on its own
    # hides the neighbouring words from the tagger (so the tags ignore sentence
    # context) and costs one tagger invocation per token.
    return [[taggedWord] for taggedWord in nltk.pos_tag(words)]

# Find nGrams
def fnGetNgrams(phrase, n):
    """Return all contiguous word n-grams of ``phrase``.

    The phrase is split on whitespace and a window of ``n`` words is slid over
    the result one word at a time.

    Args:
        phrase: The text to split into n-grams (str).
        n: Number of words per gram; must be at least 1.

    Returns:
        A list of grams, each gram being a list of ``n`` words. The list is
        empty when the phrase has fewer than ``n`` words.

    Raises:
        ValueError: If ``n`` is smaller than 1 (a non-positive window would
            otherwise yield empty or meaningless grams).
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got {}".format(n))
    words = phrase.split()
    # range() is empty when n exceeds the word count, which yields [] - the same
    # result the original loop produced (its "Reduce n" branch was unreachable).
    return [words[start:start + n] for start in range(len(words) - n + 1)]

In [11]:
fnWordLemm("automobiles")

'automobile'

In [12]:
fnStemWithProter("automobiles")

'automobil'

# Prepare the corpus

In [13]:
# NLTK stopword file ids are lowercase; "English" only resolves on case-insensitive
# file systems (e.g. Windows) and fails elsewhere.
stops = stopwords.words("english")

# Build the cleaned column in one pass from the raw text so that re-running this
# cell always starts from 'text':
#   1. replace non-letter characters with spaces (str() guards non-string cells)
#   2. collapse runs of spaces
#   3. lower-case
#   4. drop stop words
newsArticles['cleanedPhrase'] = (
    newsArticles['text']
    .apply(lambda x: fnRemoveSpecialChars(str(x)))
    .apply(lambda x: re.sub(" +", " ", x))
    .apply(fnCaseRemoval)
    .apply(lambda x: fnRemoveStopWords(stops, x))
)

# Calculate TF-IDF

In [14]:
newsArticles[["text","cleanedPhrase"]].head()

Unnamed: 0,text,cleanedPhrase
0,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...,asian exporters fear damage u japan rift mount...
1,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...,china daily says vermin eat pct grain stocks s...
2,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...,japan revise long term energy demand downwards...
3,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...,thai trade deficit widens first quarter thaila...
4,INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...,indonesia sees cpo price rising sharply indone...


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vocabulary on the cleaned articles and keep the resulting
# document-term matrix in X. min_df=1 keeps every term seen in at least one document.
vectorizer = TfidfVectorizer(min_df=1)
corpus = newsArticles['cleanedPhrase'].tolist()
X = vectorizer.fit_transform(corpus)

In [16]:
fitted = vectorizer.transform(newsArticles['cleanedPhrase'])

In [17]:
type(fitted)

scipy.sparse.csr.csr_matrix

In [18]:
# Read the shape straight from the sparse matrix; .toarray() would first allocate
# the full dense documents x terms array just to discard it.
print(fitted.shape)

(10788, 29023)


In [19]:
fitted2 = vectorizer.transform([newsArticles['cleanedPhrase'][0]])

In [20]:
# Number of learned terms. vocabulary_ maps each term to its column index, so its
# length is the feature count; unlike get_feature_names() it does not depend on the
# scikit-learn version (that method was removed in newer releases).
len(vectorizer.vocabulary_)

29023

In [21]:
fitted2.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [22]:
import gensim

In [23]:
# Prepare data for word embeddings: one token list per cleaned article.
trainData = [
    fnTokenizePrases(cleanedArticle)
    for cleanedArticle in newsArticles['cleanedPhrase']
]

In [24]:
from gensim.models import Word2Vec
# Train 10-dimensional word vectors on the tokenized articles; min_count=1 keeps
# every word, including those that occur only once.
# NOTE(review): `size` is the gensim 3.x keyword - newer gensim releases call it
# `vector_size`; confirm against the installed version before upgrading.
model = Word2Vec(
    sentences=trainData,
    size=10,
    min_count=1,
)

  "C extension not loaded, training will be slow. "


KeyboardInterrupt: 

In [None]:
model.train(trainData, total_examples=len(trainData), epochs=10)

In [None]:
model.wv.most_similar(positive = 'china')

In [None]:
model.wv.most_similar(positive = 'exporters')

In [None]:
model.wv.most_similar(positive = 'trade')

In [None]:
model.wv.most_similar(positive = 'stocks')

In [None]:
terms = model.wv.vocab
termsList = terms.keys()
# Use `.vectors` (as the shape check in the next cell does) rather than the
# deprecated `syn0` alias for the same array.
# NOTE(review): the key order of `terms` is not guaranteed to match the row order
# of `wordVecs` - verify before pairing them by position.
wordVecs = model.wv.vectors

In [None]:
model.wv.vectors.shape

In [None]:
# `.vectors` is the current name of the embedding matrix; `syn0` is a deprecated alias.
model.wv.vectors

In [None]:
# predict_output_word expects a list of context words. A bare string would be
# iterated character by character ('t', 'r', 'a', ...) instead of being treated as
# the single word "trade".
model.predict_output_word(["trade"])

# Doc2Vec Training

In [None]:
from gensim.models import Doc2Vec
import gensim

In [None]:
len(trainData)

# Define the docs

In [None]:
# Wrap every tokenized article in a TaggedDocument whose single tag is
# 'DOC_<position in trainData>'.
docs = []
for docIdx, docTokens in enumerate(trainData):
    docTag = 'DOC_' + str(docIdx)
    docs.append(gensim.models.doc2vec.TaggedDocument(words=docTokens, tags=[docTag]))

# Train model and create doc embeddings

In [None]:
# Doc2Vec hyper-parameters.
min_count = 10        # ignore words that occur fewer than 10 times
context_window = 10   # passed to Doc2Vec as `window`
vector_size = 1000    # dimensionality of each document vector
num_threads = 4       # worker threads used for training
num_epochs = 25       # passes over the corpus

# `vector_size` / `epochs` replace the deprecated `size` / `iter` keywords, which
# newer gensim releases no longer accept.
modelDoc2Vec = gensim.models.Doc2Vec(docs, min_count=min_count, window=context_window,
                                      vector_size=vector_size,
                                      workers=num_threads,
                                      epochs=num_epochs)

# Create features

In [None]:
# Copy the learned document vectors into a dense (documents x vector_size) matrix,
# one row per article, looked up by the 'DOC_<index>' tag assigned earlier.
numDocs = len(trainData)
trainingSet = np.zeros((numDocs, vector_size))
for rowIdx in range(numDocs):
    docTag = 'DOC_' + str(rowIdx)
    trainingSet[rowIdx, :] = modelDoc2Vec.docvecs[docTag]

In [None]:
trainingSet.view()

In [None]:
sims = modelDoc2Vec.docvecs.most_similar('DOC_1000')
print(sims)

In [None]:
" ".join(trainData[1000])

In [None]:
" ".join(trainData[5385])