In [24]:
import nltk
import networkx as nx

In [25]:
import itertools

In [26]:
# custom, written code
from NLP_nltk import unique_everseen, levenshtein_distance

# Twitter samples

In [27]:
from nltk.corpus import twitter_samples

In [6]:
nltk.download()

showing info http://www.nltk.org/nltk_data/


True

In [5]:
twitter_samples.fileids()

[u'negative_tweets.json',
 u'positive_tweets.json',
 u'tweets.20150430-223406.json']

In [61]:
import json

In [62]:
tweets2015tweets = twitter_samples.strings('tweets.20150430-223406.json')

In [63]:
tweets2015tweets[:25]

[u'RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain \xa3170 billion per year! #BetterOffOut #UKIP',
 u'VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY',
 u'RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. #BBCqt http://t.co\u2026',
 u'RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1',
 u"RT @thesundaypeople: UKIP's housing spokesman rakes in \xa3800k in housing benefit from migrants.  http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh\u2026",
 u'RT @Nigel_Farage: Make sure you tune in to #AskNigelFarage tonight on BBC 1 at 22:50! #UKIP http://t.co/ogHSc2Rsr2',
 u'RT @joannetallis: Ed Milliband is an embarrassment. Would you want him representing the UK?!  #bbcqt vote @Conservatives',
 u"RT @abstex: The FT is backing the Tories. On an unrelated note, here's a photo of FT leader writer Jonathan Ford (next to B

In [28]:
negativetweetstokens = twitter_samples.tokenized('negative_tweets.json')
positivetweetstokens = twitter_samples.tokenized('positive_tweets.json')
tweets2015tokens = twitter_samples.tokenized('tweets.20150430-223406.json')

In [29]:
negativetweetstagged=nltk.pos_tag_sents(negativetweetstokens)
positivetweetstagged=nltk.pos_tag_sents(positivetweetstokens)
tweets2015tagged=nltk.pos_tag_sents(tweets2015tokens)

In [30]:
defaulttags = ['NN','JJ','NNP','CD','NNS','VBN']

In [31]:
negativetweetsfiltered=[[item for item in sentence if item[1] in defaulttags] for sentence in negativetweetstagged]
positivetweetsfiltered=[[item for item in sentence if item[1] in defaulttags] for sentence in positivetweetstagged]
tweets2015filtered=[[item for item in sentence if item[1] in defaulttags] for sentence in tweets2015tagged]


## From my own defined Python function, `unique_everseen`, doing further preprocessing (i.e. data wrangling, data cleaning) 

In [32]:
negativetweetswordsonly=[list(unique_everseen([item[0] for item in tweet])) for tweet in negativetweetsfiltered]
positivetweetswordsonly=[list(unique_everseen([item[0] for item in tweet])) for tweet in positivetweetsfiltered]
tweets2015wordsonly=[list(unique_everseen([item[0] for item in tweet])) for tweet in tweets2015filtered]


In [33]:
negativewords=list( set([word for tweet in negativetweetswordsonly for word in tweet]) )
positivewords=list( set([word for tweet in positivetweetswordsonly for word in tweet]) )
tweets2015words = list( set([word for tweet in tweets2015wordsonly for word in tweet]) )


In [35]:
len(tweets2015words)

19902

These are truly large datasets and when we will need to compute all possible pairs, $10^8$ computations will be needed.  Let's try a smaller dataset(s).  

In [34]:
negativetweetswordsonly_small=negativetweetswordsonly[:500]
positivetweetswordsonly_small=positivetweetswordsonly[:500]
tweets2015wordsonly_small=tweets2015wordsonly[:500]

In [35]:
negativewords_small=list( set([word for tweet in negativetweetswordsonly_small for word in tweet]) )
positivewords_small=list( set([word for tweet in positivetweetswordsonly_small for word in tweet]) )
tweets2015words_small = list( set([word for tweet in tweets2015wordsonly_small for word in tweet]) )

In [36]:
set( tweets2015words ) == set( negativewords )

False

In [None]:
len(negativewords_small)

In [37]:
tweets2015wordsonly == positivetweetswordsonly

False

# Levenshtein Distance 

In [19]:
levenshtein_distance("thesame","thesame")

0

In [18]:
levenshtein_distance("positive","negative")

4

In [20]:
print( levenshtein_distance("#thesame","thesame") ); 
print( levenshtein_distance("#thesame","#thesame"))

1
0


## Graph Construction

In [36]:
import networkx as nx

In [37]:
import numpy as np
import scipy

In [38]:
negativetweetsgr=nx.Graph()
positivetweetsgr=nx.Graph()
tweets2015gr=nx.Graph()

In [39]:
negativetweetsgr.add_nodes_from(negativewords)
positivetweetsgr.add_nodes_from(positivewords)
tweets2015gr.add_nodes_from(tweets2015words)

In [41]:
for pair in list(itertools.combinations(negativetweetswordsonly,2) ):
    for wordpair in list(itertools.product(pair[0],pair[1])):
        firstString  = wordpair[0] 
        secondString = wordpair[1]
        levDistance = levenshtein_distance(firstString,secondString)
        negativetweetsgr.add_edge(firstString,secondString,weight=levDistance)

5000

Encapsulate all of this into a Python function: 

In [40]:
def build_edges_graph(V,tweets):
    """
    build_edges_graph
    
    INPUT(S)/ARGUMENTS
    ==================
    V := (Python list of) (unique) vertices, V
    tweets := (Python list of) tweets, from which we construct edges
    """
    # Initialize an undirected graph 
    gr = nx.Graph() 
    gr.add_nodes_from(V)

    for pair in list(itertools.combinations(tweets,2)):
        for wordpair in list(itertools.product(pair[0],pair[1])):
            firstString  = wordpair[0]
            secondString = wordpair[1]
            levDistance = levenshtein_distance(firstString,secondString)
            gr.add_edge(firstString,secondString,weight=levDistance)

    return gr
    

In [15]:
print(len(negativewords))
print(len(negativetweetswordsonly))

10632
5000


In [None]:
print(len)

In [None]:
negativetweetsgr_small = build_edges_graph(negativewords_small,negativetweetswordsonly_small)  # 5 minutes on Dell 15 inch Inspiron 7000 Gaming, 2017 


In [None]:
positivetweetsgr_small = build_edges_graph(positivewords_small,positivetweetswordsonly_small)


In [41]:
tweets2015gr_small = build_edges_graph(tweets2015words_small,tweets2015wordsonly_small)

cf. [Converting to and from other data formats, NetworkX](https://networkx.github.io/documentation/networkx-1.10/reference/convert.html)

In [20]:
nx.to_scipy_sparse_matrix(negativetweetsgr_small)

<397x397 sparse matrix of type '<type 'numpy.int64'>'
	with 155583 stored elements in Compressed Sparse Row format>

Checking to see that the number of nonzero entries, the stored elements in a Compressed Sparse Row format, is much greater than 0.5 relative to the total size of the matrix (number of columns * number of rows), we'll need to do dense math.  

In [21]:
397*397

157609

In [18]:
print(tweets2015gr_small.size())
print(tweets2015gr_small.number_of_nodes())
print(2018**2)

2032904
2018
4072324


Make the graph be one suitable for a stochastic matrix:  

In [35]:
tweets2015stochasticgr_small = nx.stochastic_graph( tweets2015gr_small.to_directed(), weight="weight") # 2 mins.

In [36]:
tweets2015stochasticnp_small = nx.to_numpy_matrix(tweets2015stochasticgr_small)

In [37]:
tweets2015stochasticnp_small;

matrix([[ 0.        ,  0.00036346,  0.00024231, ...,  0.00024231,
          0.00036346,  0.00036346],
        [ 0.00039329,  0.        ,  0.00032774, ...,  0.00039329,
          0.00032774,  0.00026219],
        [ 0.00025203,  0.00031504,  0.        , ...,  0.00031504,
          0.00031504,  0.00031504],
        ..., 
        [ 0.00024105,  0.00036158,  0.00030131, ...,  0.        ,
          0.00036158,  0.00036158],
        [ 0.00039252,  0.0003271 ,  0.0003271 , ...,  0.00039252,
          0.        ,  0.0003271 ],
        [ 0.00038605,  0.00025737,  0.00032171, ...,  0.00038605,
          0.00032171,  0.        ]])

In [46]:
for ele in tweets2015stochasticnp_small[5]:
    print ele

[[ 0.00039971  0.00026647  0.00033309 ...,  0.00039971  0.00019985
   0.00026647]]


## Ways to store large numpy arrays 

In [25]:
import h5py

In [26]:
tweets2015gr_smallHDF = h5py.File('tweets2015gr_small.h5','w')

In [27]:
tweets2015gr_smallHDF.create_dataset('A',data=nx.to_numpy_matrix( tweets2015gr_small))

<HDF5 dataset "A": shape (2018, 2018), type "<f8">

In [28]:
tweets2015gr_smallHDF.close()

In [38]:
tweets2015stochasticgr_smallHDF = h5py.File('tweets2015stochasticgr_small.h5','w')
tweets2015stochasticgr_smallHDF.create_dataset('A',data=tweets2015stochasticnp_small )

<HDF5 dataset "A": shape (2018, 2018), type "<f8">

In [40]:
tweets2015stochasticgr_smallHDF.close()

In [29]:
tweets2015np_small= nx.to_numpy_matrix(tweets2015gr_small)

In [33]:
tweets2015np_small.size

4072324

In [41]:
tweets2015stochasticnp_small.savetxt("tweets2015stochasticgr_small.txt")

AttributeError: 'matrix' object has no attribute 'savetxt'

In [45]:
np.savetxt("tweets2015stochasticgr_small.txt", tweets2015stochasticnp_small )

In [21]:
import pandas as pd

In [22]:
tweets2015gr_smallHDF = pd.HDFStore("tweets2015gr_small.hdf")

In [24]:
tweets2015gr_smallHDF.append("A", pd.DataFrame(nx.to_numpy_matrix( tweets2015gr_small ) ) )

HDF5ExtError: HDF5 error back trace

  File "../../../src/H5A.c", line 259, in H5Acreate2
    unable to create attribute
  File "../../../src/H5Aint.c", line 275, in H5A_create
    unable to create attribute in object header
  File "../../../src/H5Oattribute.c", line 347, in H5O_attr_create
    unable to create new attribute in header
  File "../../../src/H5Omessage.c", line 224, in H5O_msg_append_real
    unable to create new message
  File "../../../src/H5Omessage.c", line 1945, in H5O_msg_alloc
    unable to allocate space for message
  File "../../../src/H5Oalloc.c", line 1142, in H5O_alloc
    object header message is too large

End of HDF5 error back trace

Can't set attribute 'non_index_axes' in node:
 /A (Group) ''.

### Save the nodes of the graphs

In [12]:
import pickle

In [51]:
f=open("tweets2015grnodes_small.pkl","wb")
pickle.dump(tweets2015gr_small.nodes(),f)

In [52]:
len( tweets2015gr_small.nodes() )

2018

In [23]:
negative_nparr = nx.to_numpy_matrix(negativetweetsgr_small)

In [24]:
positive_nparr = nx.to_numpy_matrix(positivetweetsgr_small)

In [25]:
tweets2015_nparr = nx.to_numpy_matrix(tweets2015gr_small)

In [None]:
tweets2015gr_small

In [28]:
print(negative_nparr.size)
print(positive_nparr.size)
print(tweets2015_nparr.size)



157609
210681
393129


In [24]:
negativetweetsgr_trial.edges();
print(negativetweetsgr_trial.number_of_edges());
print(negativetweetsgr_trial.number_of_nodes());
negativetweetsgr_trial.size()

1318
10632


1318

### Doing pagerank with networkx

In [None]:
tweets2015_pagerank = nx.pagerank(tweets2015gr_small,weight="weight")

I found that this caused my kernel to freeze.  My system is a 2017 Dell 15 in. Inspiron 7000 Gaming, with 8 GB RAM.  

# CUDA C/C++ CUBLAS to the rescue: 

After running `main_pagerank.exe` (this was made by compiling with this command)   

```    
nvcc -std=c++11 -arch='sm_61' -lcublas main_pagerank.cu pagerank.cu -o main_pagerank.exe    
```   


In [1]:
pagerankresultfile = open("Pagerankresult.txt",'r')

In [2]:
pagerankresultlist = []
for line in pagerankresultfile:
    pagerankresultlist.append(line)

In [5]:
pagerankresultlist = pagerankresultlist[0].split()

In [10]:
print(len(pagerankresultlist))
pagerankresultlist = [float(ele) for ele in pagerankresultlist]
print(sum(pagerankresultlist))

2018
1.000001126


In [22]:
import cPickle as pickle

In [42]:
tweets2015grnodes_smallpkl = pickle.load(open("tweets2015grnodes_small.pkl",'rb'))

EOFError: 

In [14]:
tweets2015grnodes_smallfile = open("tweets2015grnodes_small.pkl",'rb')

In [15]:
tweets2015grnodes_smalllist= []
for line in tweets2015grnodes_smallfile:
    tweets2015grnodes_smalllist.append(line)

In [16]:
print(len(tweets2015grnodes_smalllist))

3945


In [21]:
tweets2015grnodes_smalllist[:100]

['(lp0\n',
 'Vyellow\n',
 'p1\n',
 'aVfour\n',
 'p2\n',
 'aVknell\n',
 'p3\n',
 'aVhttp://t.co/ShXTCyJSgw\n',
 'p4\n',
 "aVtonight's\n",
 'p5\n',
 'aVvotes\n',
 'p6\n',
 'aVvoter\n',
 'p7\n',
 'aVWatch\n',
 'p8\n',
 'aVpost-election\n',
 'p9\n',
 'aVvoted\n',
 'p10\n',
 'aVwhatsoever\n',
 'p11\n',
 'aVsorry\n',
 'p12\n',
 'aVeditorship\n',
 'p13\n',
 'aVworth\n',
 'p14\n',
 'aV@SkyNews\n',
 'p15\n',
 'aVClegg\n',
 'p16\n',
 'aVGE\n',
 'p17\n',
 'aVbringing\n',
 'p18\n',
 'aVvast\n',
 'p19\n',
 "aVwe'll\n",
 'p20\n',
 'aV@Independent\n',
 'p21\n',
 'aV@NursieDear25\n',
 'p22\n',
 'aVcalled\n',
 'p23\n',
 'aVunrelated\n',
 'p24\n',
 'aVred\n',
 'p25\n',
 'aVSNP\n',
 'p26\n',
 'aVLeanne\n',
 'p27\n',
 'aVforce\n',
 'p28\n',
 'aVleaders\n',
 'p29\n',
 'aVtired\n',
 'p30\n',
 'aVhttp://t.co/PjP3yb5u6t\n',
 'p31\n',
 'aV@mark_stuart10\n',
 'p32\n',
 'aVhttps://t.co/k4NMrenulf\n',
 'p33\n',
 'aVpunning\n',
 'p34\n',
 'aVbudget\n',
 'p35\n',
 'aVsecond\n',
 'p36\n',
 'aV7th\n',
 'p37\n',
 'aVe

In [43]:
tweets2015gr_small.nodes()[:100]

[u'yellow',
 u'four',
 u'knell',
 u'http://t.co/ShXTCyJSgw',
 u"tonight's",
 u'votes',
 u'voter',
 u'Watch',
 u'post-election',
 u'voted',
 u'whatsoever',
 u'sorry',
 u'editorship',
 u'worth',
 u'@SkyNews',
 u'Clegg',
 u'GE',
 u'bringing',
 u'vast',
 u"we'll",
 u'@Independent',
 u'@NursieDear25',
 u'called',
 u'unrelated',
 u'red',
 u'SNP',
 u'Leanne',
 u'force',
 u'leaders',
 u'tired',
 u'http://t.co/PjP3yb5u6t',
 u'@mark_stuart10',
 u'https://t.co/k4NMrenulf',
 u'punning',
 u'budget',
 u'second',
 u'7th',
 u'estimated',
 u'blue',
 u'@LibDem',
 u'Kilmarnock',
 u'+',
 u'#torycull',
 u"Cameron's",
 u'spokesman',
 u'@jamesosh',
 u'new',
 u'Presidential',
 u'@bratdha',
 u'men',
 u'nowt',
 u'becm',
 u'GRN',
 u'100',
 u'celebration',
 u'@scottieh419',
 u'kids',
 u'rests',
 u'muddying',
 u'reports',
 u'NOT',
 u'aka',
 u"someone's",
 u'changes',
 u'Bingo',
 u'plea',
 u'campaign',
 u"DON'T",
 u'@Socialist',
 u'FUCK',
 u'foodbanks',
 u'Book',
 u'Wales',
 u'total',
 u'http://t.co/ZgZbSwnZxZ',
 u

So if pickle/pickling fails, then rebuilding the graph with its node, and assuming that the order of nodes is the same as before, 

In [44]:
print(len(tweets2015gr_small.nodes()))

2018


In [45]:
pageranktweets2015results = zip(tweets2015gr_small.nodes(), pagerankresultlist)

In [53]:
list( reversed( sorted(pageranktweets2015results, key=lambda x: x[1])[-75:]  ) )

[(u'May', 0.00107337),
 (u'sketch', 0.00106792),
 (u'poverty', 0.00106506),
 (u'Carswell', 0.00106405),
 (u'favours', 0.00106181),
 (u'Strong', 0.00106171),
 (u'yourself', 0.00106052),
 (u'SNP.Tim', 0.00106006),
 (u'Liam', 0.00105989),
 (u'@TheMockneyRebel', 0.00105888),
 (u'@jreynoldsMP', 0.00105595),
 (u'turns', 0.00105517),
 (u'wonder', 0.0010536),
 (u'Worked', 0.00105347),
 (u'Voting', 0.00105171),
 (u"Here's", 0.00105133),
 (u'37', 0.00104981),
 (u'Vine', 0.00104812),
 (u'various', 0.0010408),
 (u'@mark_stuart10', 0.00103613),
 (u"Labour's", 0.00103214),
 (u'time', 0.00103193),
 (u'http://t.co/4\u2026', 0.0010311),
 (u'banks', 0.0010308),
 (u'#r4today', 0.00103022),
 (u'#VoteLabour', 0.00103017),
 (u'chances', 0.00102981),
 (u'@DonaldLiddell', 0.0010289),
 (u'#cullthetories', 0.00102863),
 (u'Dave', 0.00102825),
 (u'history', 0.0010282),
 (u'http://t\u2026', 0.0010281),
 (u'overspent', 0.00102767),
 (u'@ThimbleOfGrace', 0.00102698),
 (u'http://t.co/7eIU0c5Fm1', 0.00102615),
 (u'@H

In [20]:
dir(negativetweetsgr_trial)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__init__',
 '__iter__',
 '__len__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'add_cycle',
 'add_edge',
 'add_edges_from',
 'add_node',
 'add_nodes_from',
 'add_path',
 'add_star',
 'add_weighted_edges_from',
 'adj',
 'adjacency_iter',
 'adjacency_list',
 'adjlist_dict_factory',
 'clear',
 'copy',
 'degree',
 'degree_iter',
 'edge',
 'edge_attr_dict_factory',
 'edges',
 'edges_iter',
 'get_edge_data',
 'graph',
 'has_edge',
 'has_node',
 'is_directed',
 'is_multigraph',
 'name',
 'nbunch_iter',
 'neighbors',
 'neighbors_iter',
 'node',
 'node_dict_factory',
 'nodes',
 'nodes_iter',
 'nodes_with_selfloops',
 'number_of_edges',
 'number_of_nodes',
 'number_of_selfloops',
 'order',
 'remove_edge',
 'remove_edges_from',
 'remove_node',
 'remov

In [None]:
dir(nx)

In [None]:
negativetweetsgr = build_edges_graph(negativewords, negativetweetswordsonly)

In [47]:
for wordpair in list(itertools.product(pair2[0], pair2[1] ) ):
    print( levenshtein_distance( wordpair[0], wordpair[1]))

6
8
4
3
5
0
