对1月27日的dat数据进行清洗和描述

In [2]:
with open("/Users/YOLLY/desktop/cjc/tweet0127.dat", \
          'rb') as f:
    tweets1 = f.readlines()
In [3]:
len(tweets1)
Out[3]:
11993
In [4]:
varNames = tweets1[0].replace('\n', '').split(',')
varNames
Out[4]:
['Twitter ID',
 'Text',
 'Profile Image URL',
 'Day',
 'Hour',
 'Minute',
 'Created At',
 'Geo',
 'From User',
 'From User ID',
 'Language',
 'To User',
 'To User ID',
 'Source     ']
In [5]:
with open("/Users/YOLLY/desktop/cjc/tweet0127.dat", 'w') as f:
    right_line = '' 
    blocks = [] 
    for line in tweets1:
        right_line += line.replace('\n', ' ')
        line_length = len(right_line.split(','))
        if line_length >= 14:
            blocks.append(right_line)
            right_line = ''
    for i in blocks:
        f.write(i + '\n')
In [6]:
len(blocks)
Out[6]:
11993
In [14]:
with open("/Users/YOLLY/desktop/cjc/tweet0127.dat", 'w') as f:
    right_line = '' 
    blocks = [] 
    for line in tweets1:
        right_line += line.replace('\n', ' ').replace('\r', ' ')
        line_length = len(right_line.split(','))
        if line_length >= 14:
            blocks.append(right_line)
            right_line = ''
    for i in blocks:
        f.write(i + '\n')
In [7]:
with open("/Users/YOLLY/desktop/cjc/tweet0127.dat", 'rb') as f:
    chunk1 = f.readlines()
In [8]:
len(chunk1)
Out[8]:
11993
In [9]:
chunk1[:3]
Out[9]:
['Twitter ID,Text,Profile Image URL,Day,Hour,Minute,Created At,Geo,From User,From User ID,Language,To User,To User ID,Source      \n',
 " 6263754,162686269481156608,The 27 Republican Bills That Aren't About Jobs  http://t.co/fhcVmqwR  #ows #occupy #opdx,http://a3.twimg.com/profile_images/1739660175/african.violet.5_normal.jpg,2012-01-27,0,0,2012-01-27 00:00:03,N;,zengreen,170208476,en,,0,<a href="http://www.tweetdeck.com" rel="nofollow">TweetDeck</a>      \n",
 ' 6263755,162686280986140672,"For those interested in #FinancialFriday, the weekly bank march in San Jose, here\'s more info: http://t.co/cnKv1UJz #OO #OSJ #OSF #OWS",http://a1.twimg.com/profile_images/1751422937/DSC06020_normal.jpg,2012-01-27,0,0,2012-01-27 00:00:06,N;,OccupyManJose,386911819,en,,0,<a href="http://twitter.com/">web</a>      \n']
In [10]:
import csv
clean_lines1 = (line.replace('\x00','') \
               for line in chunk1[1:])
lines1 = csv.reader(clean_lines1, delimiter=',', \
                   quotechar='"')
In [11]:
import pandas as pd

df1 = pd.read_csv("/Users/YOLLY/desktop/cjc/tweet0127.dat",\
                 sep = ',', quotechar='"')
df1[:3]
Out[11]:
Twitter ID Text Profile Image URL Day Hour Minute Created At Geo From User From User ID Language To User To User ID Source
6263754 162686269481156608 The 27 Republican Bills That Aren't About Jobs... http://a3.twimg.com/profile_images/1739660175/... 2012-01-27 0 0 2012-01-27 00:00:03 N; zengreen 170208476 en NaN 0 <a href="http://www.tweetdeck.com&quot...
6263755 162686280986140672 For those interested in #FinancialFriday, the ... http://a1.twimg.com/profile_images/1751422937/... 2012-01-27 0 0 2012-01-27 00:00:06 N; OccupyManJose 386911819 en NaN 0 <a href="http://twitter.com/">...
6263756 162686293048954880 The 27 Republican Bills That Aren't About Jobs... http://a0.twimg.com/profile_images/1209829084/... 2012-01-27 0 0 2012-01-27 00:00:09 N; dohlink 21453088 en NaN 0 <a href="http://www.tweetdeck.com&quot...
In [26]:
df1.Text[6263754]
Out[26]:
"The 27 Republican Bills That Aren't About Jobs  http://t.co/fhcVmqwR  #ows #occupy #opdx"
In [42]:
from collections import defaultdict
data_dict = defaultdict(int)
line_num = 0
lines = csv.reader((line.replace('\x00','') for line in chunk1[1:]), delimiter=',', quotechar='"')
for i in lines:
    line_num +=1
    data_dict[i[9]] +=1
In [13]:
import csv
lines = csv.reader((line.replace('\x00','') for line in chunk1[1:]), delimiter=',', quotechar='"')
users = [i[9] for i in lines]
In [14]:
lines = csv.reader((line.replace('\x00','') for line in chunk1[1:]), delimiter=',', quotechar='"')
tweets =[i[2] for i in lines]
In [15]:
a=[]
for i in range(len(users)):
    a.append([users[i],tweets[i]])
    i=i+1
In [16]:
import pandas as pd
dat = pd.DataFrame(data=a, columns = ['users', 'tweets'])
In [17]:
dat.describe()
Out[17]:
users tweets
count 11992 11992
unique 4758 9442
top owsar RT @OccupyWallStNYC: “The duty of a patriot is...
freq 469 89
In [19]:
%matplotlib inline
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc("savefig", dpi=100)
font = FontProperties(fname=r'/Users/YOLLY/desktop/cjc/msyh.ttf', size=14)
In [90]:
tweet_dict = defaultdict(int)
for i in data_dict.values():
    tweet_dict[i] += 1
 
plt.loglog(tweet_dict.keys(), tweet_dict.values(), 'co',linewidth=2)  
plt.xlim((0, 1000))
plt.xlabel(u'推特数', fontproperties=font)
plt.ylabel(u'人数', fontproperties=font )
plt.show()
In [130]:
import numpy as np
import statsmodels.api as sm

def powerPlot(d_value, d_freq, color, marker):
    d_freq = [i + 1 for i in d_freq]
    d_prob = [float(i)/sum(d_freq) for i in d_freq]
    #d_rank = ss.rankdata(d_value).astype(int)
    x = np.log(d_value)
    y = np.log(d_prob)
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y,xx).fit()
    constant,beta = res.params
    r2 = res.rsquared
    plt.plot(d_value, d_prob, linestyle = '',  color = color, marker = marker)
    plt.plot(d_value, np.exp(constant+x*beta),"pink")
    plt.xscale('log'); plt.yscale('log')
    plt.text(max(d_value)/2,max(d_prob)/10,
             r'$\beta$ = ' + str(round(beta,2)) +'\n' + r'$R^2$ = ' + str(round(r2, 2)))
In [83]:
histo, bin_edges = np.histogram(data_dict.values(), 15)
bin_center = 0.5*(bin_edges[1:] + bin_edges[:-1])
powerPlot(bin_center,histo, 'lightblue', 'o')
#lg=plt.legend(labels = [u'Tweets', u'Fit'], loc=3, fontsize=20)
plt.ylabel(u'概率', fontproperties=font)
plt.xlabel(u'推特数', fontproperties=font)
plt.show()
In [145]:
import statsmodels.api as sm
from collections import defaultdict
import numpy as np

def powerPlot(data):
    d = sorted(data, reverse = True )
    d_table = defaultdict(int)
    for k in d:
        d_table[k] += 1
    d_value = sorted(d_table)
    d_value = [i+1 for i in d_value]
    d_freq = [d_table[i]+1 for i in d_value]
    d_prob = [float(i)/sum(d_freq) for i in d_freq]
    #d_rank = ss.rankdata(d_value).astype(int)
    x = np.log(d_value)
    y = np.log(d_prob)
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y,xx).fit()
    constant,beta = res.params
    r2 = res.rsquared
    plt.plot(d_value, d_prob, 'co')
    plt.plot(d_value, np.exp(constant+x*beta),"pink")
    plt.xscale('log'); plt.yscale('log')
    plt.text(max(d_value)/2,max(d_prob)/5,
             'Beta = ' + str(round(beta,2)) +'\n' + 'R squared = ' + str(round(r2, 2)))
    plt.title('Distribution')
    plt.ylabel('P(K)')
    plt.xlabel('K')
    plt.show()
In [92]:
powerPlot(data_dict.values())
In [100]:
import powerlaw
def plotPowerlaw(data,ax,col,xlab):
    fit = powerlaw.Fit(data,xmin=2)
    #fit = powerlaw.Fit(data)
    fit.plot_pdf(color = col, linewidth = 2)
    a,x = (fit.power_law.alpha,fit.power_law.xmin)
    fit.power_law.plot_pdf(color = col, linestyle = 'dotted', ax = ax, \
                            label = r"$\alpha = %d \:\:, x_{min} = %d$" % (a,x))
    ax.set_xlabel(t'w, fontsize = 20)
    ax.set_ylabel('$Probability$', fontsize = 20)
    plt.legend(loc = 0, frameon = False)
In [101]:
from collections import defaultdict
data_dict = defaultdict(int)

for i in df1['From User']:
    data_dict[i] += 1
In [103]:
import matplotlib.cm as cm
cmap = cm.get_cmap('rainbow_r',6)

fig = plt.figure(figsize=(6, 4),facecolor='white')
ax = fig.add_subplot(1, 1, 1)
plotPowerlaw(data_dict.values(), ax,cmap(1), '$tweets$')

统计小时相关的数据

In [146]:
from collections import defaultdict
hour_dict = defaultdict(int)
line_num = 0
lines1 = csv.reader((line.replace('\x00','') for line in chunk1[1:]), delimiter=',', quotechar='"')
for i in lines1:
    line_num +=1
    hour_dict[i[5]] +=1
In [128]:
time_dict = defaultdict(int)
for i in hour_dict.values():
    time_dict[i] += 1
 
plt.plot(hour_dict.keys(), hour_dict.values(), 'co',linewidth=2)  
plt.xlabel(u'时间', fontproperties=font)
plt.ylabel(u'推特数', fontproperties=font )
plt.show()
In [149]:
import numpy as np
import statsmodels.api as sm

def powerPlot(d_value, d_freq, color, marker):
    d_freq = [i + 1 for i in d_freq]
    d_prob = [float(i)/sum(d_freq) for i in d_freq]
    #d_rank = ss.rankdata(d_value).astype(int)
    x = np.log(d_value)
    y = np.log(d_prob)
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y,xx).fit()
    constant,beta = res.params
    r2 = res.rsquared
    plt.plot(d_value, d_prob, linestyle = '',  color = color, marker = marker)
    plt.plot(d_value, np.exp(constant+x*beta),"pink")
    plt.xscale('log'); plt.yscale('log')
    plt.text(max(d_value)/2,max(d_prob)/10,
             r'$\beta$ = ' + str(round(beta,2)) +'\n' + r'$R^2$ = ' + str(round(r2, 2)))
In [151]:
histo, bin_edges = np.histogram(hour_dict.values(), 15)
bin_center = 0.5*(bin_edges[1:] + bin_edges[:-1])
powerPlot(bin_center,histo, 'lightblue', 'o')
#lg=plt.legend(labels = [u'Tweets', u'Fit'], loc=3, fontsize=20)
plt.ylabel(u'概率', fontproperties=font)
plt.xlabel(u'时间', fontproperties=font)
plt.show()

文本分析部分

In [104]:
import twitter_text
In [157]:
import re

tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! 
            #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com 
            http://ccc.nju.edu.cn RT !!HELP!!!!'''

rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", \
                         re.IGNORECASE)
rt_user_name = rt_patterns.findall(tweet)[0][1].strip(' @')
rt_user_name
print rt_user_name

if rt_user_name:
    print 'it exits.'
else:
    print 'None'
AnonKitsu
it exits.
In [158]:
def extract_rt_user(tweet):
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    rt_user_name = rt_patterns.findall(tweet)
    if rt_user_name:
        rt_user_name = rt_user_name[0][1].strip(' @')
    else:
        rt_user_name = None
    return rt_user_name
In [160]:
def extract_tweet_text(tweet, at_names, urls):
    for i in at_names:
        tweet = tweet.replace(i, '')
    for j in urls:
        tweet = tweet.replace(j, '')
    marks = ['RT @', '@', '"', '#', '\n', '\t', '  ']
    for k in marks:
        tweet = tweet.replace(k, '')
    return tweet
In [161]:
import twitter_text

tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! 
            #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com 
            http://ccc.nju.edu.cn RT !!HELP!!!!'''

ex = twitter_text.Extractor(tweet)
at_names = ex.extract_mentioned_screen_names()
urls = ex.extract_urls()
hashtags = ex.extract_hashtags()
rt_user = extract_rt_user(tweet)
tweet_text = extract_tweet_text(tweet, at_names, urls)

print at_names, urls, hashtags, rt_user,'-------->', tweet_text
[u'AnonKitsu', u'chengjun', u'mili'] [u'http://computational-communication.com', u'http://ccc.nju.edu.cn'] [u'OCCUPYWALLSTREET', u'OWS', u'OCCUPYNY'] AnonKitsu --------> : ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! OCCUPYWALLSTREET OWS OCCUPYNY PLEASE RT !!HELP!!!!
In [162]:
chunk1[:5]
Out[162]:
['Twitter ID,Text,Profile Image URL,Day,Hour,Minute,Created At,Geo,From User,From User ID,Language,To User,To User ID,Source     \n',
 " 6263754,162686269481156608,The 27 Republican Bills That Aren't About Jobs  http://t.co/fhcVmqwR  #ows #occupy #opdx,http://a3.twimg.com/profile_images/1739660175/african.violet.5_normal.jpg,2012-01-27,0,0,2012-01-27 00:00:03,N;,zengreen,170208476,en,,0,<a href="http://www.tweetdeck.com" rel="nofollow">TweetDeck</a>     \n",
 ' 6263755,162686280986140672,"For those interested in #FinancialFriday, the weekly bank march in San Jose, here\'s more info: http://t.co/cnKv1UJz #OO #OSJ #OSF #OWS",http://a1.twimg.com/profile_images/1751422937/DSC06020_normal.jpg,2012-01-27,0,0,2012-01-27 00:00:06,N;,OccupyManJose,386911819,en,,0,<a href="http://twitter.com/">web</a>     \n',
 " 6263756,162686293048954880,The 27 Republican Bills That Aren't About Jobs  http://t.co/cJnc1HT1  #ows #occupy #opdx,http://a0.twimg.com/profile_images/1209829084/spalshes2_normal.jpg,2012-01-27,0,0,2012-01-27 00:00:09,N;,dohlink,21453088,en,,0,<a href="http://www.tweetdeck.com" rel="nofollow">TweetDeck</a>     \n",
 ' 6263757,162686297276817408,"RT @TheNewDeal: Dear Republicans, ALL YOUR ANCESTORS IMMIGRATED HERE. If You\'re Not a Native American Indian, Shut the Hell Up. #OWS #p2 #tcot",http://a1.twimg.com/profile_images/1361063617/Don_Cooley_-_Facebook_-_19May11_normal.jpg,2012-01-27,0,0,2012-01-27 00:00:10,N;,CoolR1a,289005244,en,,0,<a href="http://twitter.com/">web</a>     \n']
In [163]:
import csv

lines = csv.reader((line.replace('\x00','') for line in chunk1[1:]), delimiter=',', quotechar='"')
tweets = [i[2] for i in lines]
In [193]:
len(tweets)
Out[193]:
11992
In [181]:
import sys
reload(sys)
sys.setdefaultencoding('utf8')
In [182]:
for tweet in tweets[0:11991]:
    ex = twitter_text.Extractor(tweet)
    at_names = ex.extract_mentioned_screen_names()
    urls = ex.extract_urls()
    hashtags = ex.extract_hashtags()
    rt_user = extract_rt_user(tweet)
    tweet_text = extract_tweet_text(tweet, at_names, urls)
    tweetsclean.append(tweet_text)
In [185]:
tweetsclean[-5:]
Out[185]:
[u': THE ECONOMY IS IMPROVING UNDER OBAMA: Economy Grew By 2.8% Last QuarterRt OWS p2 tcot',
 u'Twas an excellent day to shut Chase Bank DOWN!!! OWS OWSAtlanta ',
 u': Daily Kos: Wall Street donors swamp all others in election\xa0spendingows MoveToAmend p2',
 u'. we ows agree, comrade.We need to outlaw the concept of $ and go back to living in the woods and trading livestock.',
 u": We're not leaving. Give the Pittmans the deed back. occupyatlanta ows "]
In [186]:
len(tweetsclean)
Out[186]:
12090
In [189]:
#保存list为text
fl=open('tweetsclean.txt', 'w')
for i in tweetsclean:
    fl.write(i)
    fl.write("\n")
fl.close()

情感分析

In [249]:
import nltk
In [250]:
pos_tweets = [('If you have been swept away by the passion of ows ,\
then you know how important of a tool twitter was and still is. Twittercensored', 'postive'),
    ('OccupyMidWest wepay - Please post, forward, send out, \
    tweet, orwhatever you ca...: OccupyMidW...ows occupympls', 'postive'),
    ('OWS has left an indelible impression on the GOP race; time \
    will tell what effect it has on the Democratic side politics\
Water Has A “Memory” |Watch:tcot tlot ows p2 teaparty', 'postive'),
    ('President Obama begins a federal investigation into Wall Street \
    ows occupy politics sotu', 'postive'),
    ('Don\'t Decolonize Your Mind, Decolonize Our LandNativeAmerican \
    indigenous NDNZ OWS anarchist topprog p2 ...', 'postive')]

neg_tweets = [('Dear Republicans, ALL YOUR ANCESTORS IMMIGRATED HERE. \
               If You\'re Not a Native American Indian, Shut the Hell Up. OWS p2 tcot', 'negative'),
    ('WTF?! G3t your FILTHY hand5 0ff my int3rnet!censorship stopSOPA \
    stopACTA stopHR1981 stopPIPA OWS ANONYMOUS', 'negative'),
    ('Yo, Twitter. I will not be silenced and \
    I will not shut the fuck up about OWS, \
    ACTA, NDAA, EEA, SOPA, or PIPA. TwitterCensorship', 'negative'),
    ('a thin skinned jerk who is 2 egomaniacal 2 deal with criticism in a book “: \
    racist bias haters liars p2 wiunion ows”Jägermeister ♬▼☛★ows', 'negative'),
    ('Hey , go ask Bin Laden (mastermind of 9/11) if Pres. \
    Obama appeases our enemies! Idiot. gop p2 ows tcot teaparty Hardball', 'negative')]
In [251]:
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))
tweets[:2]
Out[251]:
[(['you',
   'have',
   'been',
   'swept',
   'away',
   'the',
   'passion',
   'ows',
   ',then',
   'you',
   'know',
   'how',
   'important',
   'tool',
   'twitter',
   'was',
   'and',
   'still',
   'is.',
   'twittercensored'],
  'postive'),
 (['occupymidwest',
   'wepay',
   'please',
   'post,',
   'forward,',
   'send',
   'out,',
   'tweet,',
   'orwhatever',
   'you',
   'ca...:',
   'occupymidw...ows',
   'occupympls'],
  'postive')]
In [252]:
test_tweets = [
    (['feel', 'happy', 'Please', 'passion'], 'postive'),
    (['wtf', 'shut up'], 'negative'),
    (['fuck', 'WTF'], 'negative'),
    (['nuts', 'jerk', 'great'], 'negative'),
    (['shit', 'angry', 'annoying'], 'negative')]
In [253]:
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_tweets(tweets))
' '.join(word_features)
Out[253]:
"impression all pres. don't politicswater hand5 hell still stophr1981 up. g3t with egomaniacal 0ff stopacta send post, has hardball twittercensorship ... jerk indigenous know anonymous not int3rnet!censorship acta, tweet, idiot. liars side swept twitter. twitter street orwhatever dear our ca...: wtf?! federal away please landnativeamerican mind, appeases sotu passion is. stopsopa \xe2\x80\x9c: gop ows, stoppipa shut tcot tlot effect ask immigrated about occupymidwest american here. anarchist out, criticism wall into eea, wiunion race; your ows you're deal occupy hey been occupymidw...ows racist teaparty was tell \xe2\x99\xac\xe2\x96\xbc\xe2\x98\x9b\xe2\x98\x85ows silenced tool ,then investigation forward, haters |watch:tcot obama begins topprog sopa, wepay will indian, thin and bin occupympls ndaa, fuck bias have twittercensored politics native ancestors laden decolonize \xe2\x80\x9cmemory\xe2\x80\x9d how book you 9/11) (mastermind ows\xe2\x80\x9dj\xc3\xa4germeister skinned indelible who what ndnz important president filthy republicans, enemies! yo, pipa. time the democratic left"
In [254]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
In [255]:
training_set = nltk.classify.util.apply_features(extract_features,tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)
In [256]:
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    label_probdist = estimator(label_freqdist)
    feature_probdist = {}
    model = NaiveBayesClassifier(label_probdist, feature_probdist)
    return model
In [276]:
tweet_test = ': Occupy the Dream joins the Occupy Wallstreet movement OWS '
a = classifier.classify(extract_features(tweet_test.split()))
a
Out[276]:
'postive'
In [277]:
tweet_test2 = 'we ows agree, comrade.We need to outlaw the concept \
of $ and go back to living in the woods and trading livestock.'
b=classifier.classify(extract_features(tweet_test2.split()))
b
Out[277]:
'postive'
In [278]:
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
classif = SklearnClassifier(LinearSVC())
svm_classifier = classif.train(training_set)
In [279]:
tweet_negative2 = ': i\'ve fallen in love with the badass members of Poland\'s parliament\
protesting ACTA :)Anonymous OWS SOPA'
c = svm_classifier.classify(extract_features(tweet_negative2.split()))
c
Out[279]:
'negative'
In [281]:
def classify_tweet(tweet):
    return classifier.classify(extract_features(tweet)) 
   
total = accuracy = float(len(test_tweets))

for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1

d ='Total accuracy: %f%% (%d/20).' % (accuracy / total * 100, accuracy)
d
Out[281]:
'Total accuracy: 60.000000% (3/20).'
In [283]:
for i in tweetsclean:
    tweet_test = i
    a = classifier.classify(extract_features(tweet_test.split()))
    a