# Read the raw tweet dump into memory as a list of lines.
with open("/Users/YOLLY/desktop/cjc/tweet0127.dat", 'rb') as f:
    tweets1 = f.readlines()
len(tweets1)
# The first line is treated as the comma-separated header row.
header_row = tweets1[0].replace('\n', '')
varNames = header_row.split(',')
varNames
# Re-join physical lines into logical records and write them back to the
# same path. A record is considered complete once the accumulated text
# splits into at least 14 comma-separated pieces.
with open("/Users/YOLLY/desktop/cjc/tweet0127.dat", 'w') as f:
    blocks = []
    right_line = ''
    for line in tweets1:
        right_line = right_line + line.replace('\n', ' ')
        if len(right_line.split(',')) < 14:
            # Not enough fields yet: keep accumulating.
            continue
        blocks.append(right_line)
        right_line = ''
    f.writelines(record + '\n' for record in blocks)
len(blocks)
# Same record repair as before, but carriage returns are blanked as well
# as newlines before the pieces are concatenated.
with open("/Users/YOLLY/desktop/cjc/tweet0127.dat", 'w') as f:
    blocks = []
    right_line = ''
    for line in tweets1:
        flattened = line.replace('\n', ' ').replace('\r', ' ')
        right_line = right_line + flattened
        if len(right_line.split(',')) < 14:
            # Record still incomplete: keep accumulating.
            continue
        blocks.append(right_line)
        right_line = ''
    f.writelines(record + '\n' for record in blocks)
# Load the repaired file back into memory, one list entry per line.
with open("/Users/YOLLY/desktop/cjc/tweet0127.dat", 'rb') as f:
    chunk1 = list(f)
len(chunk1)
chunk1[:3]
import csv
import pandas as pd

# Lazily strip NUL bytes from every data row (the header at index 0 is
# skipped) and hand the cleaned rows to a CSV reader.
clean_lines1 = (raw.replace('\x00', '') for raw in chunk1[1:])
lines1 = csv.reader(clean_lines1, delimiter=',', quotechar='"')

# Load the same file as a DataFrame for column-based access.
df1 = pd.read_csv(
    "/Users/YOLLY/desktop/cjc/tweet0127.dat",
    sep=',',
    quotechar='"',
)
df1[:3]
df1.Text[6263754]
from collections import defaultdict

# Tally rows per value of field 9 (the field later read into `users`).
data_dict = defaultdict(int)
line_num = 0
lines = csv.reader(
    (raw.replace('\x00', '') for raw in chunk1[1:]),
    delimiter=',',
    quotechar='"',
)
# line_num ends up as the number of rows visited.
for line_num, row in enumerate(lines, 1):
    data_dict[row[9]] += 1
import csv
import pandas as pd

# Collect field 9 (users) and field 2 (tweets) in a single pass over the
# cleaned rows instead of parsing the whole CSV twice.
lines = csv.reader(
    (line.replace('\x00', '') for line in chunk1[1:]),
    delimiter=',',
    quotechar='"',
)
users = []
tweets = []
for row in lines:
    users.append(row[9])
    tweets.append(row[2])

# Pair each user with the corresponding tweet text.
a = [[user, text] for user, text in zip(users, tweets)]

dat = pd.DataFrame(data=a, columns=['users', 'tweets'])
dat.describe()
%matplotlib inline
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
import matplotlib
# Figure defaults and a font file used for the Chinese axis labels.
matplotlib.rc("savefig", dpi=100)
font = FontProperties(fname=r'/Users/YOLLY/desktop/cjc/msyh.ttf', size=14)

# tweet_dict maps "count per key in data_dict" -> "number of keys with that count".
tweet_dict = defaultdict(int)
for n_tweets in data_dict.values():
    tweet_dict[n_tweets] += 1

x_counts = tweet_dict.keys()
y_users = tweet_dict.values()
plt.loglog(x_counts, y_users, 'co', linewidth=2)
plt.xlim((0, 1000))
plt.xlabel(u'推特数', fontproperties=font)
plt.ylabel(u'人数', fontproperties=font )
plt.show()
import numpy as np
import statsmodels.api as sm
def powerPlot(d_value, d_freq, color, marker):
    """Plot a frequency distribution on log-log axes with an OLS fit.

    Args:
        d_value: x values (the visible caller passes histogram bin centres).
        d_freq: frequency observed for each entry of ``d_value``.
        color: matplotlib color used for the data points.
        marker: matplotlib marker used for the data points.

    The fitted slope (beta) and R^2 of the regression of log(probability)
    on log(value) are written onto the current axes.
    """
    # Add one to every frequency so empty bins do not produce log(0).
    d_freq = [i + 1 for i in d_freq]
    # Compute the normalising total once rather than once per element.
    total = float(sum(d_freq))
    d_prob = [i / total for i in d_freq]
    #d_rank = ss.rankdata(d_value).astype(int)
    x = np.log(d_value)
    y = np.log(d_prob)
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, xx).fit()
    constant, beta = res.params
    r2 = res.rsquared
    plt.plot(d_value, d_prob, linestyle='', color=color, marker=marker)
    # Fitted line, mapped back from log space.
    plt.plot(d_value, np.exp(constant + x * beta), "pink")
    plt.xscale('log'); plt.yscale('log')
    plt.text(max(d_value)/2, max(d_prob)/10,
             r'$\beta$ = ' + str(round(beta,2)) +'\n' + r'$R^2$ = ' + str(round(r2, 2)))
# Bin the per-key counts in data_dict into 15 bins and fit the binned
# distribution.
histo, bin_edges = np.histogram(data_dict.values(), bins=15)
# Midpoint of every bin.
bin_center = (bin_edges[:-1] + bin_edges[1:]) * 0.5
powerPlot(bin_center, histo, 'lightblue', 'o')
#lg=plt.legend(labels = [u'Tweets', u'Fit'], loc=3, fontsize=20)
plt.xlabel(u'推特数', fontproperties=font)
plt.ylabel(u'概率', fontproperties=font)
plt.show()
import statsmodels.api as sm
from collections import defaultdict
import numpy as np
def powerPlot(data):
    """Plot the distribution of the values in ``data`` with a log-log OLS fit.

    Args:
        data: iterable of hashable, sortable numeric observations.

    Each distinct value is counted, both the value and its count are
    shifted by one (so neither can be zero under the logarithm), and the
    resulting probabilities are regressed on the values in log-log space.
    The figure is shown before returning.
    """
    d_table = defaultdict(int)
    for k in data:
        d_table[k] += 1
    keys = sorted(d_table)
    d_value = [k + 1 for k in keys]
    # Look frequencies up with the ORIGINAL keys; indexing d_table with the
    # shifted values would misalign counts and insert phantom zero entries.
    d_freq = [d_table[k] + 1 for k in keys]
    total = float(sum(d_freq))
    d_prob = [i / total for i in d_freq]
    #d_rank = ss.rankdata(d_value).astype(int)
    x = np.log(d_value)
    y = np.log(d_prob)
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, xx).fit()
    constant, beta = res.params
    r2 = res.rsquared
    plt.plot(d_value, d_prob, 'co')
    # Fitted line, mapped back from log space.
    plt.plot(d_value, np.exp(constant + x * beta), "pink")
    plt.xscale('log'); plt.yscale('log')
    plt.text(max(d_value)/2, max(d_prob)/5,
             'Beta = ' + str(round(beta,2)) +'\n' + 'R squared = ' + str(round(r2, 2)))
    plt.title('Distribution')
    plt.ylabel('P(K)')
    plt.xlabel('K')
    plt.show()
# Fit and plot the distribution of the per-key counts held in data_dict.
per_key_counts = data_dict.values()
powerPlot(per_key_counts)
import powerlaw
def plotPowerlaw(data, ax, col, xlab):
    """Plot the empirical PDF of ``data`` and its fitted power law.

    Args:
        data: observations passed to ``powerlaw.Fit`` (with ``xmin=2``).
        ax: matplotlib axes that receives the fitted curve and the labels.
        col: color used for both the empirical and the fitted curve.
        xlab: text for the x-axis label.
    """
    fit = powerlaw.Fit(data, xmin=2)
    #fit = powerlaw.Fit(data)
    fit.plot_pdf(color=col, linewidth=2)
    a, x = (fit.power_law.alpha, fit.power_law.xmin)
    # NOTE: '%d' truncates alpha to an integer in the legend text.
    fit.power_law.plot_pdf(color=col, linestyle='dotted', ax=ax,
                           label=r"$\alpha = %d \:\:, x_{min} = %d$" % (a, x))
    # The original line here was a syntax error; use the caller's label.
    ax.set_xlabel(xlab, fontsize=20)
    ax.set_ylabel('$Probability$', fontsize=20)
    plt.legend(loc=0, frameon=False)
from collections import defaultdict
import matplotlib.cm as cm

# Rebuild data_dict from the DataFrame: rows per 'From User' value.
data_dict = defaultdict(int)
for user in df1['From User']:
    data_dict[user] += 1

cmap = cm.get_cmap('rainbow_r', 6)
fig = plt.figure(figsize=(6, 4), facecolor='white')
ax = fig.add_subplot(1, 1, 1)
curve_color = cmap(1)
plotPowerlaw(data_dict.values(), ax, curve_color, '$tweets$')
from collections import defaultdict

# Tally rows per value of field 5 (presumably a time/hour field, going by
# the variable name -- verify against the file header).
hour_dict = defaultdict(int)
line_num = 0
lines1 = csv.reader(
    (raw.replace('\x00', '') for raw in chunk1[1:]),
    delimiter=',',
    quotechar='"',
)
# line_num ends up as the number of rows visited.
for line_num, row in enumerate(lines1, 1):
    hour_dict[row[5]] += 1

# time_dict maps "rows per key" -> "number of keys with that many rows".
time_dict = defaultdict(int)
for n_rows in hour_dict.values():
    time_dict[n_rows] += 1

plt.plot(hour_dict.keys(), hour_dict.values(), 'co', linewidth=2)
plt.xlabel(u'时间', fontproperties=font)
plt.ylabel(u'推特数', fontproperties=font )
plt.show()
import numpy as np
import statsmodels.api as sm
def powerPlot(d_value, d_freq, color, marker):
    """Plot a frequency distribution on log-log axes with an OLS fit.

    Args:
        d_value: x values (the visible caller passes histogram bin centres).
        d_freq: frequency observed for each entry of ``d_value``.
        color: matplotlib color used for the data points.
        marker: matplotlib marker used for the data points.

    The fitted slope (beta) and R^2 of the regression of log(probability)
    on log(value) are written onto the current axes.
    """
    # Add one to every frequency so empty bins do not produce log(0).
    d_freq = [i + 1 for i in d_freq]
    # Compute the normalising total once rather than once per element.
    total = float(sum(d_freq))
    d_prob = [i / total for i in d_freq]
    #d_rank = ss.rankdata(d_value).astype(int)
    x = np.log(d_value)
    y = np.log(d_prob)
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, xx).fit()
    constant, beta = res.params
    r2 = res.rsquared
    plt.plot(d_value, d_prob, linestyle='', color=color, marker=marker)
    # Fitted line, mapped back from log space.
    plt.plot(d_value, np.exp(constant + x * beta), "pink")
    plt.xscale('log'); plt.yscale('log')
    plt.text(max(d_value)/2, max(d_prob)/10,
             r'$\beta$ = ' + str(round(beta,2)) +'\n' + r'$R^2$ = ' + str(round(r2, 2)))
# Bin the per-key counts in hour_dict into 15 bins and fit the binned
# distribution.
histo, bin_edges = np.histogram(hour_dict.values(), bins=15)
# Midpoint of every bin.
bin_center = (bin_edges[:-1] + bin_edges[1:]) * 0.5
powerPlot(bin_center, histo, 'lightblue', 'o')
#lg=plt.legend(labels = [u'Tweets', u'Fit'], loc=3, fontsize=20)
plt.xlabel(u'时间', fontproperties=font)
plt.ylabel(u'概率', fontproperties=font)
plt.show()
import twitter_text
import re
# Sample tweet used to try out the retweet pattern.
tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!!
#OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com
http://ccc.nju.edu.cn RT !!HELP!!!!'''
# "RT" or "via" followed by one or more @mentions, case-insensitive.
rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
first_match = rt_patterns.findall(tweet)[0]
# Group 2 holds the mention run; trim the surrounding spaces and '@'.
rt_user_name = first_match[1].strip(' @')
rt_user_name
print(rt_user_name)
print('it exits.' if rt_user_name else 'None')
def extract_rt_user(tweet):
    """Return the user(s) credited by the first "RT"/"via" marker in ``tweet``.

    The result is the mention run that follows the first marker, with
    leading/trailing spaces and '@' removed, or None when the tweet has no
    such marker.
    """
    matches = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE).findall(tweet)
    if not matches:
        return None
    # Each match is a (marker, mentions) pair; only the first one is used.
    return matches[0][1].strip(' @')
def extract_tweet_text(tweet, at_names, urls):
    """Strip mentions, URLs and markup characters from ``tweet``.

    Every string in ``at_names`` is removed first, then every string in
    ``urls``, then a fixed list of marker substrings. Removal is plain
    substring replacement applied in that order.
    """
    marks = ['RT @', '@', '"', '#', '\n', '\t', ' ']
    for fragments in (at_names, urls, marks):
        for fragment in fragments:
            tweet = tweet.replace(fragment, '')
    return tweet
import twitter_text
tweet = '''RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!!
#OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE @chengjun @mili http://computational-communication.com
http://ccc.nju.edu.cn RT !!HELP!!!!'''
ex = twitter_text.Extractor(tweet)
at_names = ex.extract_mentioned_screen_names()
urls = ex.extract_urls()
hashtags = ex.extract_hashtags()
rt_user = extract_rt_user(tweet)
tweet_text = extract_tweet_text(tweet, at_names, urls)
print at_names, urls, hashtags, rt_user,'-------->', tweet_text
chunk1[:5]
import csv
# Field 2 of every data row is collected as the tweet text.
lines = csv.reader((line.replace('\x00','') for line in chunk1[1:]), delimiter=',', quotechar='"')
tweets = [i[2] for i in lines]
len(tweets)
import sys
# Python 2 only: make UTF-8 the implicit str<->unicode codec.
reload(sys)
sys.setdefaultencoding('utf8')
# The accumulator must exist before the loop appends to it.
tweetsclean = []
for tweet in tweets[0:11991]:
    ex = twitter_text.Extractor(tweet)
    at_names = ex.extract_mentioned_screen_names()
    urls = ex.extract_urls()
    hashtags = ex.extract_hashtags()
    rt_user = extract_rt_user(tweet)
    tweet_text = extract_tweet_text(tweet, at_names, urls)
    tweetsclean.append(tweet_text)
tweetsclean[-5:]
len(tweetsclean)
# Save the list as a text file, one cleaned tweet per line.
# The context manager closes the file even if a write fails.
with open('tweetsclean.txt', 'w') as fl:
    for i in tweetsclean:
        fl.write(i)
        fl.write("\n")
import nltk
# Hand-labelled example tweets as (text, label) pairs.
# NOTE: the label is spelled 'postive'; test_tweets uses the same spelling,
# so the two must be changed together if the typo is ever corrected.
pos_tweets = [('If you have been swept away by the passion of ows ,\
then you know how important of a tool twitter was and still is. Twittercensored', 'postive'),
('OccupyMidWest wepay - Please post, forward, send out, \
tweet, orwhatever you ca...: OccupyMidW...ows occupympls', 'postive'),
('OWS has left an indelible impression on the GOP race; time \
will tell what effect it has on the Democratic side politics\
Water Has A “Memory” |Watch:tcot tlot ows p2 teaparty', 'postive'),
('President Obama begins a federal investigation into Wall Street \
ows occupy politics sotu', 'postive'),
('Don\'t Decolonize Your Mind, Decolonize Our LandNativeAmerican \
indigenous NDNZ OWS anarchist topprog p2 ...', 'postive')]
# Hand-labelled example tweets carrying the 'negative' label.
neg_tweets = [('Dear Republicans, ALL YOUR ANCESTORS IMMIGRATED HERE. \
If You\'re Not a Native American Indian, Shut the Hell Up. OWS p2 tcot', 'negative'),
('WTF?! G3t your FILTHY hand5 0ff my int3rnet!censorship stopSOPA \
stopACTA stopHR1981 stopPIPA OWS ANONYMOUS', 'negative'),
('Yo, Twitter. I will not be silenced and \
I will not shut the fuck up about OWS, \
ACTA, NDAA, EEA, SOPA, or PIPA. TwitterCensorship', 'negative'),
('a thin skinned jerk who is 2 egomaniacal 2 deal with criticism in a book “: \
racist bias haters liars p2 wiunion ows”Jägermeister ♬▼☛★ows', 'negative'),
('Hey , go ask Bin Laden (mastermind of 9/11) if Pres. \
Obama appeases our enemies! Idiot. gop p2 ows tcot teaparty Hardball', 'negative')]
# Tokenise every labelled tweet: split on whitespace, keep tokens of at
# least three characters, lower-case them, and keep the label alongside.
tweets = [
    ([token.lower() for token in text.split() if len(token) >= 3], label)
    for (text, label) in pos_tweets + neg_tweets
]
tweets[:2]
# Held-out examples as (token list, expected label) pairs.
test_tweets = [
    (['feel', 'happy', 'Please', 'passion'], 'postive'),
    (['wtf', 'shut up'], 'negative'),
    (['fuck', 'WTF'], 'negative'),
    (['nuts', 'jerk', 'great'], 'negative'),
    (['shit', 'angry', 'annoying'], 'negative'),
]
def get_words_in_tweets(tweets):
    """Flatten ``(words, sentiment)`` pairs into one list of words.

    Words keep their original order and duplicates are preserved; the
    sentiment labels are ignored.
    """
    return [word for (words, _sentiment) in tweets for word in words]
def get_word_features(wordlist):
    """Return the keys of an ``nltk.FreqDist`` built from ``wordlist``."""
    return nltk.FreqDist(wordlist).keys()
# Vocabulary of the labelled tweets, used as the classifier feature set.
all_words = get_words_in_tweets(tweets)
word_features = get_word_features(all_words)
' '.join(word_features)
def extract_features(document):
    """Map ``document`` to a dict of boolean "contains(word)" features.

    There is one entry per word in the module-level ``word_features``; the
    value is True when that word occurs in ``document``.
    """
    present = set(document)
    return {'contains(%s)' % word: (word in present) for word in word_features}
# Build the (features, label) training set from the tokenised tweets and
# train a Naive Bayes classifier on it.
apply_features = nltk.classify.util.apply_features
training_set = apply_features(extract_features, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    """Train and return a Naive Bayes classifier.

    Args:
        labeled_featuresets: (featureset, label) pairs to train on.
        estimator: probability-distribution class used for smoothing.

    The previous body referenced ``label_freqdist`` and
    ``NaiveBayesClassifier``, neither of which is defined in this file, so
    any call raised NameError. The work is delegated to NLTK's own trainer.
    """
    return nltk.NaiveBayesClassifier.train(labeled_featuresets, estimator)
# Classify two ad-hoc tweets with the Naive Bayes model.
tweet_test = ': Occupy the Dream joins the Occupy Wallstreet movement OWS '
features_test = extract_features(tweet_test.split())
a = classifier.classify(features_test)
a
tweet_test2 = 'we ows agree, comrade.We need to outlaw the concept \
of $ and go back to living in the woods and trading livestock.'
features_test2 = extract_features(tweet_test2.split())
b = classifier.classify(features_test2)
b
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier

# Train a linear SVM on the same training set through NLTK's
# scikit-learn wrapper, then classify one ad-hoc tweet.
classif = SklearnClassifier(LinearSVC())
svm_classifier = classif.train(training_set)
tweet_negative2 = ': i\'ve fallen in love with the badass members of Poland\'s parliament\
protesting ACTA :)Anonymous OWS SOPA'
features_negative2 = extract_features(tweet_negative2.split())
c = svm_classifier.classify(features_negative2)
c
def classify_tweet(tweet):
    """Return the label the module-level ``classifier`` assigns to ``tweet``.

    ``tweet`` is passed straight to ``extract_features``, so it is expected
    to be an iterable of tokens.
    """
    features = extract_features(tweet)
    return classifier.classify(features)
# Start from "all correct" and subtract one for every misclassified item.
total = accuracy = float(len(test_tweets))
for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1
# Report against the real test-set size instead of a hard-coded 20.
d = 'Total accuracy: %f%% (%d/%d).' % (accuracy / total * 100, accuracy, total)
d
# Classify every cleaned tweet; `a` holds only the label of the last one
# once the loop has finished.
for tweet_test in tweetsclean:
    features = extract_features(tweet_test.split())
    a = classifier.classify(features)
a