# -*- coding: utf-8 -*-
# 甄子锐  (name line carried over from the notebook export; presumably the author)
# Pass 1: stitch the physical lines of the raw export back into records.
# Text is accumulated line by line (newlines removed) until it contains at
# least 14 comma-separated fields; it is then written out as one line of the
# cleaned file and the accumulator is reset.
chunkSize = 10000000  # size hint handed to readlines() for each batch
block = ''            # partial record, carried across lines and batches
blocks = []           # records completed in the current batch; defined up
                      # front so it exists even when the input is empty
with open('/Users/abner/github/data/ows-raw.txt', 'rb') as bigfile, \
        open("/Users/abner/github/data/ows-raw_clean.txt", 'w') as f:
    chunk = bigfile.readlines(chunkSize)
    while chunk:
        blocks = []
        for line in chunk:
            block += line.replace('\n', '')
            # Number of comma-separated fields == number of commas + 1.
            block_length = block.count(',') + 1
            if block_length >= 14:
                blocks.append(block)
                block = ''
        for i in blocks:
            f.write(i + '\n')
        chunk = bigfile.readlines(chunkSize)
# Any trailing text that never reached 14 fields is still in `block` here
# and is not written to the cleaned file.
from collections import defaultdict
import csv
# Pass 2: group the cleaned records by date.
# data_dict maps the date string found in column 3 to the list of column-8
# values seen for that date (presumably user identifiers -- TODO confirm).
data_dict = defaultdict(list)
error_num = 0   # rows whose column-3 value is not exactly 10 characters
line_num = 0    # rows yielded by the csv reader
total_num = 0   # physical lines read from the file
chunkSize = 100000000
with open('/Users/abner/github/data/ows-raw_clean.txt', 'rb') as bigfile:
    chunk = bigfile.readlines(chunkSize)
    while chunk:
        total_num += len(chunk)
        # NUL characters are removed before parsing (presumably because the
        # csv reader rejects them -- TODO confirm).
        lines = csv.reader((line.replace('\x00', '') for line in chunk),
                           delimiter=',', quotechar='"')
        for i in lines:
            line_num += 1
            if len(i) < 4:
                # No date column at all: skipped without being counted.
                continue
            date = i[3]
            if len(date) != 10:
                error_num += 1
            elif len(i) > 8:
                # Only well-formed rows are recorded.  Checking the row length
                # first avoids creating an empty data_dict entry for a date
                # whose row lacks column 8.
                data_dict[date].append(i[8])
        chunk = bigfile.readlines(chunkSize)
print('%s %s %s' % (line_num, total_num, error_num))
# Notebook display expression; the joined string is discarded in a script.
" , ".join(data_dict.keys())
import pandas as pd
# One row per date key: [date, number of entries, number of distinct entries].
data = [[i, len(data_dict[i]), len(set(data_dict[i]))] for i in data_dict]
dat = pd.DataFrame(data, columns=['date', 'tweets', 'users'])
dat.date = pd.to_datetime(dat.date)
# DataFrame.sort() no longer exists in pandas; sort_values() is the
# replacement and returns a new frame ordered by the given column.
dat = dat.sort_values('date')
import numpy as np
# Notebook display expressions: the values are shown only in an interactive
# session and are discarded when this file runs as a script.
np.sum(dat.tweets)  # sum of the `tweets` column
dat.describe()  # descriptive statistics of the frame
%matplotlib inline
import matplotlib.cm as cm
import matplotlib.pyplot as plt
# Plot the `tweets` and `users` columns against `date` on a log-scaled y axis.
fig = plt.figure(figsize=(15, 4), facecolor='white')
for column, fmt, label in (('tweets', 'r-o', "Tweets"),
                           ('users', 'g-^', "Users")):
    plt.plot(dat.date, dat[column], fmt, label=label)
plt.legend(loc=2, fontsize=8)
plt.yscale('log')
plt.show()
# Notebook display expression; has no effect in a script.
dat
# Collect the lines of the current batch whose 4th comma-separated field
# equals the target date, then save them to their own file.
ows_data = []
date = '2011-10-13'
for line in chunk[1:]:  # the first line of the batch is skipped
    div_line = line.replace('\n', ' ')
    lines = div_line.split(',')
    # Lines with fewer than four fields cannot match; skip them instead of
    # raising IndexError.
    if len(lines) > 3 and lines[3] == date:
        ows_data.append(line)
# Notebook display expression; has no effect in a script.
len(ows_data)
with open("/Users/abner/github/data/ows_20111013.txt", 'w') as f:
    for i in ows_data:
        # The stored lines are the unmodified originals and may still end in
        # a newline; normalise to exactly one so no blank lines are written.
        f.write(i.rstrip('\n') + '\n')
# Sentiment analysis
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier
def get_words_in_tweets(tweets):
    """Flatten the word sequences of labelled tweets into one list.

    Args:
        tweets: iterable of ``(words, sentiment)`` pairs, where ``words`` is
            an iterable of tokens.

    Returns:
        A list containing every token of every pair, in the order
        encountered and with duplicates kept.  The sentiment labels are
        ignored.
    """
    return [word for (words, _sentiment) in tweets for word in words]
def get_word_features(wordlist):
    """Return the distinct words of ``wordlist`` as a list.

    The words are passed through ``nltk.FreqDist`` and its keys are
    returned.  The keys are copied into a real list so the result does not
    stay tied to the frequency distribution object on Python 3, where
    ``keys()`` yields a view rather than a list.

    Args:
        wordlist: iterable of word tokens.

    Returns:
        A list with one entry per distinct word, in the order that
        ``FreqDist.keys()`` produces them.
    """
    freq_dist = nltk.FreqDist(wordlist)
    return list(freq_dist.keys())
def read_tweets(fname, t_type):
    """Read a text file into a list of ``[line, t_type]`` pairs.

    Args:
        fname: path of the file to read.
        t_type: label attached to every line of the file.

    Returns:
        A list with one two-element list per line of the file.  Each line is
        kept exactly as read, including its trailing newline.  An empty file
        gives an empty list.
    """
    # The context manager closes the file even if reading raises.
    with open(fname, 'r') as f:
        return [[line, t_type] for line in f]
def extract_features(document):
    """Build the presence/absence feature dict for one document.

    Args:
        document: iterable of tokens.

    Returns:
        A dict with one ``'contains(<word>)'`` key for every word in the
        module-level ``word_features``; the value is True when that word
        occurs in ``document`` and False otherwise.
    """
    present = set(document)
    return {'contains(%s)' % word: (word in present)
            for word in word_features}
def classify_tweet(tweet):
    """Return the label the module-level ``classifier`` gives to ``tweet``.

    The text is tokenised with ``nltk.word_tokenize``, converted to a
    feature dict by ``extract_features`` and passed to
    ``classifier.classify``.
    """
    tokens = nltk.word_tokenize(tweet)
    features = extract_features(tokens)
    return classifier.classify(features)
# Read in the positive and negative training tweets.
pos_tweets = read_tweets('/Users/abner/github/data/happy.txt', 'positive')
neg_tweets = read_tweets('/Users/abner/github/data/sad.txt', 'negative')
# Filter words: split on whitespace, lower-case, and keep only tokens of at
# least three characters.
tweets = [
    ([e.lower() for e in words.split() if len(e) >= 3], sentiment)
    for (words, sentiment) in pos_tweets + neg_tweets
]
# Derive the word features from the training data.
word_features = get_word_features(get_words_in_tweets(tweets))
# Build the training set and train the Naive Bayes classifier on it.
training_set = nltk.classify.util.apply_features(extract_features, tweets)
classifier = NaiveBayesClassifier.train(training_set)
# Evaluate the classifier on the labelled test tweets.
# To add your own test tweets, add them in the respective files.
test_tweets = read_tweets('/Users/abner/github/data/happy_test.txt', 'positive')
test_tweets.extend(read_tweets('/Users/abner/github/data/sad_test.txt', 'negative'))
# `accuracy` starts at the number of test tweets and is decremented once per
# misclassification, so it ends up as the count of correct predictions.
total = accuracy = float(len(test_tweets))
# Show the first few test tweets.
for i in test_tweets[:5]:
    print(i[0])
for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1
if total:
    # Report against the real number of test tweets rather than a fixed 20.
    print('Total accuracy: %f%% (%d/%d).' % (accuracy / total * 100, accuracy, total))
else:
    # Avoid dividing by zero when both test files are empty.
    print('Total accuracy: n/a (no test tweets).')
# Classify every record currently held in `blocks` after decoding it as UTF-8.
tweet_emotion = [classify_tweet(unicode(tweet, "utf8")) for tweet in blocks]
print(tweet_emotion[:5])
print(len(tweet_emotion))
# Dictionary mapping each predicted label to the number of times it occurs.
count1 = {}
for r in tweet_emotion:
    count1.setdefault(r, 0)
    count1[r] += 1
print(count1)
import matplotlib.pyplot as plt
# Pie chart of the two sentiment classes.
# NOTE(review): the sizes are hard-coded rather than read from `count1`;
# presumably they were copied from an earlier run -- confirm before reuse.
labels = ('positive', 'negative')
sizes = [3068, 5851]
colors = ['red', 'lightgreen']
pie_options = dict(labels=labels, colors=colors, shadow=True, startangle=90)
plt.pie(sizes, **pie_options)
plt.show()