# EX6: Article Spinner

In [3]:
import nltk
import random
import numpy as np
import pandas as pd

In [4]:
data = pd.read_table('NLP_Q6.txt',header=None,names='C')

In [16]:
trigrams = {}
for m in range(len(data) - 1):
    s = data['C'][m].lower()
    tokens = nltk.tokenize.word_tokenize(s)
    for i in range(len(tokens) - 2):
        k = (tokens[i], tokens[i+2])
        if k not in trigrams:
            trigrams[k] = []
        trigrams[k].append(tokens[i+1])

In [26]:
trigram_probabilities = {}
for k, words in trigrams.items():
    # create a dictionary of word -> count
    if len(set(words)) > 1:
        # only do this when there are different possibilities for a middle word
        d = {}
        n = 0
        for w in words:
            if w not in d:
                d[w] = 0
            d[w] += 1
            n += 1
        for w, c in d.items():
            d[w] = float(c) / n
        trigram_probabilities[k] = d

In [20]:
def random_sample(d):
    # choose a random sample from dictionary where values are the probabilities
    r = random.random()
    cumulative = 0
    for w, p in d.items():
        cumulative += p
        if r < cumulative:
            return w

In [21]:
def test_spinner(m):
    s = m.lower()
    print ("Original:", s)
    tokens = nltk.tokenize.word_tokenize(s)
    for i in range(len(tokens) - 2):
        if random.random() < 0.2: # 20% chance of replacement
            k = (tokens[i], tokens[i+2])
            if k in trigram_probabilities:
                w = random_sample(trigram_probabilities[k])
                tokens[i+1] = w
    print ("Spun:")
    print (" ".join(tokens).replace(" .", ".").replace(" '", "'").replace(" ,", ",").replace("$ ", "$").replace(" !", "!"))

In [23]:
test_spinner(data['C'][1])

('Original:', 'in the present world of cut-throat internet marketing, you need all the help you get. so why not cut down on the time needed to create new unique content? this is probably the most time consuming part of your website building process.')
Spun:
in the present world of cut-throat internet marketing, you need all the help you get. so why not cut down on the time needed to create new unique content ? this is probably the most time consuming part of your website building process.


# EX7: Latent Semantic Analysis

In [29]:
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import pandas as pd

In [127]:
data1 = pd.read_table('NLP_Q7_1.txt',header=None,names='C',encoding='utf-8')
data2 = pd.read_table('NLP_Q7_2.txt',header=None,names='C',encoding='utf-8')
data3 = pd.read_table('NLP_Q7_3.txt',header=None,names='C',encoding='utf-8')
data4 = pd.read_table('NLP_Q7_4.txt',header=None,names='C',encoding='utf-8')
data5 = pd.read_table('NLP_Q7_5.txt',header=None,names='C',encoding='utf-8')
data6 = pd.read_table('NLP_Q7_6.txt',header=None,names='C',encoding='utf-8')
data7 = pd.read_table('NLP_Q7_7.txt',header=None,names='C',encoding='utf-8')
data8 = pd.read_table('NLP_Q7_8.txt',header=None,names='C',encoding='utf-8')

In [128]:
frames = [data1,data2,data3,data4,data5,data6,data7,data8]
data = pd.concat(frames,ignore_index=True)

In [129]:
data

Unnamed: 0,C
0,For all the dangers football poses to its play...
1,For all the dangers football poses to its play...
2,College football recruiters have their GPS dev...
3,The prospect of housing astronauts beneath the...
4,There are still a few challenges to be worked ...
5,"Nevertheless, if we manage to seal off a lunar..."
6,According to radar data from the agency's SELE...
7,Lava tubes formed billions of years ago when t...
8,President Donald Trump has made it known he w...
9,Trump is embracing a tried-and-true tactic bef...


In [130]:
Corpus =data.as_matrix(columns=None) 

In [131]:
vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')


In [132]:
Final_Corpus= []


In [133]:
for i in range(len(Corpus)-1):
    Final_Corpus.append(Corpus[i][0])


In [142]:
dtm = vectorizer.fit_transform(Final_Corpus)

In [136]:
print(pd.DataFrame(dtm.toarray(),index=Final_Corpus,columns=vectorizer.get_feature_names()).head(10))

                                                    12  2014  300  330  37  \
For all the dangers football poses to its playe...   0     0    0    0   1   
For all the dangers football poses to its playe...   0     0    0    0   1   
College football recruiters have their GPS devi...   0     0    0    0   0   
The prospect of housing astronauts beneath the ...   0     1    0    0   0   
There are still a few challenges to be worked o...   0     0    0    0   0   
Nevertheless, if we manage to seal off a lunar ...   0     0    0    0   0   
According to radar data from the agency's SELEN...   0     0    1    1   0   
Lava tubes formed billions of years ago when th...   0     0    0    0   0   
 President Donald Trump has made it known he wo...   0     0    0    0   0   
Trump is embracing a tried-and-true tactic befo...   1     0    0    0   0   

                                                    according  active  \
For all the dangers football poses to its playe...          0       

In [137]:
dtm

<16x371 sparse matrix of type '<type 'numpy.int64'>'
	with 473 stored elements in Compressed Sparse Row format>

In [138]:
vectorizer.get_feature_names()

[u'12',
 u'2014',
 u'300',
 u'330',
 u'37',
 u'according',
 u'active',
 u'addition',
 u'adults',
 u'agency',
 u'ago',
 u'agriculture',
 u'air',
 u'allegations',
 u'alliance',
 u'announced',
 u'areas',
 u'argues',
 u'assailed',
 u'astronauts',
 u'athletics',
 u'attempted',
 u'base',
 u'based',
 u'battery',
 u'begin',
 u'behalf',
 u'beneath',
 u'billions',
 u'board',
 u'box',
 u'bring',
 u'broken',
 u'brought',
 u'busy',
 u'buzz',
 u'cabinet',
 u'called',
 u'calling',
 u'candidate',
 u'car',
 u'cargo',
 u'cars',
 u'cases',
 u'cave',
 u'caves',
 u'challenges',
 u'chamber',
 u'channels',
 u'chasm',
 u'chevrolet',
 u'child',
 u'china',
 u'closes',
 u'collapsed',
 u'college',
 u'colony',
 u'comfortably',
 u'comment',
 u'communities',
 u'compared',
 u'compartment',
 u'compartments',
 u'composed',
 u'concept',
 u'cone',
 u'configuration',
 u'constantly',
 u'contains',
 u'convenient',
 u'converted',
 u'corvair',
 u'dangers',
 u'data',
 u'dauphine',
 u'day',
 u'dec',
 u'decades',
 u'defended',
 

In [140]:
lsa = TruncatedSVD(2, algorithm = 'arpack')

In [None]:
dtm_lsa = lsa.fit_transform(dtm)

In [147]:
dtm_lsa = Normalizer(copy=False).fit_transform(dtm)

In [148]:
dtm_lsa

<16x371 sparse matrix of type '<type 'numpy.float64'>'
	with 473 stored elements in Compressed Sparse Row format>

In [153]:
print(pd.DataFrame(dtm_lsa, index = Final_Corpus, columns = ["component_1","component_2"]))

                                                                                          component_1  \
For all the dangers football poses to its playe...    (0, 4)\t0.123091490979\n  (0, 17)\t0.1230914...   
For all the dangers football poses to its playe...    (0, 4)\t0.123091490979\n  (0, 17)\t0.1230914...   
College football recruiters have their GPS devi...    (0, 4)\t0.123091490979\n  (0, 17)\t0.1230914...   
The prospect of housing astronauts beneath the ...    (0, 4)\t0.123091490979\n  (0, 17)\t0.1230914...   
There are still a few challenges to be worked o...    (0, 4)\t0.123091490979\n  (0, 17)\t0.1230914...   
Nevertheless, if we manage to seal off a lunar ...    (0, 4)\t0.123091490979\n  (0, 17)\t0.1230914...   
According to radar data from the agency's SELEN...    (0, 4)\t0.123091490979\n  (0, 17)\t0.1230914...   
Lava tubes formed billions of years ago when th...    (0, 4)\t0.123091490979\n  (0, 17)\t0.1230914...   
 President Donald Trump has made it known he wo...    (

In [None]:
similarity = np.asarray(np.asmatrix(dtm_lsa) * np.asmatrix(dtm_lsa).T)
pd.DataFrame(similarity,index=example, columns=example).head(10)

In [160]:
np.asmatrix(dtm_lsa)

matrix([[ <16x371 sparse matrix of type '<type 'numpy.float64'>'
 	with 473 stored elements in Compressed Sparse Row format>]], dtype=object)

In [172]:
np.asmatrix(dtm_lsa).T

matrix([[ <16x371 sparse matrix of type '<type 'numpy.float64'>'
 	with 473 stored elements in Compressed Sparse Row format>]], dtype=object)

# EX5: Negative and Positive Sentiment

In [180]:
senti_dict = {}
for each_line in open('Dict.txt'):
    word,score = each_line.split(',')
    senti_dict[word] = int(score)

In [181]:
senti_dict

{'annoy': -2,
 'annoyed': -2,
 'annoyingly ': -1,
 'awesome': 4,
 'beautiful': 2,
 'beautifully': 2,
 'beauty': 2,
 'best': 2,
 'better': 1,
 'excellent': 4,
 'good': 2,
 'hate': -2,
 'hatred': -3,
 'nasty': -2,
 'nice': 3,
 'supreme': 3,
 'wonderful': 1}

In [182]:
data = pd.read_table('NLP_Q5.txt',header=None,names='C')

In [191]:
SentenceArray = []
for X in range(len(data) -1):
    #sentence = word_tokenize(data['C'][X])
    sentence = data['C'][X].lower().split()
    SentenceArray.append(sentence)

In [193]:
SentenceArray[0]

['rose', 'is', 'beautiful.']

In [203]:
for X in range(len(SentenceArray) -1):
    print data['C'][X] 
    print(sum( senti_dict.get(word, 0) for word in SentenceArray[X]))

Rose is beautiful.
0
Place is nasty to stay.
-2
This is the beauty of this technique.
2
Concept is explained beautifully in this book.
2
He annoyed me.
-2
Its the supreme place to stay.
3
I hate this place.
-2
Dont annoy the customer.
-2
He has given nasty comments about his stay.
-2
