This repository has been archived by the owner on Nov 4, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tweetstotxt.py
50 lines (42 loc) · 1.69 KB
/
tweetstotxt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
__author__ = 'croman'
import codecs
from lxml import etree
import rdflib
def convert(datasetfile, format):
    """Extract the tweet texts of a dataset into a plain-text file.

    The texts are written one per line (every text is followed by a
    newline character) to a UTF-8 encoded file whose name is the dataset
    path with its trailing extension replaced by ``.txt``.

    Supported values of ``format``:

    * ``'xml-collection'``: XML parsed with lxml. For every ``//Tweet``
      element the first text node of its ``TweetText`` child is used.
    * ``'xml-socialtv'``: XML parsed with lxml. For every ``//tweet``
      element its own first text node is used.
    * ``'nif'``: N3 data parsed with rdflib. Triples whose subject ends
      with ``','`` and whose predicate ends with ``'isString'`` provide
      the texts. The tweet id is the part of the subject between
      ``'.xml/'`` and ``'#'``; a later triple with the same id replaces
      an earlier one, and the lines are written in sorted (string) order
      of the ids.

    Any other value of ``format`` is ignored: nothing is parsed and no
    file is written.

    :param datasetfile: path of the dataset file to read.
    :param format: name of the dataset format, see above.
    :returns: None
    :raises IndexError: if an XML tweet has no text node, or if a
        matching NIF subject does not contain ``'.xml/'``.
    """
    if format == 'xml-collection':
        dataset = etree.parse(datasetfile)
        texts = [tweet.xpath('./TweetText/text()')[0]
                 for tweet in dataset.xpath('//Tweet')]
        _write_tweets(_txt_path(datasetfile, '.xml'), texts)
    elif format == 'xml-socialtv':
        dataset = etree.parse(datasetfile)
        texts = [tweet.xpath('./text()')[0]
                 for tweet in dataset.xpath('//tweet')]
        _write_tweets(_txt_path(datasetfile, '.xml'), texts)
    elif format == 'nif':
        graph = rdflib.Graph()
        graph.parse(datasetfile, format='n3')
        texts_by_id = {}
        for subject, predicate, obj in graph:
            if subject.endswith(',') and predicate.endswith('isString'):
                tweetid = subject.split('#')[0].split('.xml/')[1]
                texts_by_id[tweetid] = obj
        # Graph iteration order is not meaningful, so order the output by id.
        texts = [texts_by_id[tweetid] for tweetid in sorted(texts_by_id)]
        _write_tweets(_txt_path(datasetfile, '.ttl'), texts)


def _txt_path(datasetfile, extension):
    """Return ``datasetfile`` with a trailing ``extension`` swapped for '.txt'."""
    # Only strip the extension at the very end of the path; cutting at the
    # first occurrence would truncate paths such as 'run.xml.d/tweets.xml'.
    if datasetfile.endswith(extension):
        datasetfile = datasetfile[:-len(extension)]
    return datasetfile + '.txt'


def _write_tweets(path, texts):
    """Write every text in the list ``texts`` plus a newline to ``path`` as UTF-8."""
    content = '\n'.join(texts)
    if texts:
        # Every tweet, including the last one, is terminated by a newline.
        content += '\n'
    # The codec writer does the UTF-8 encoding; binary mode keeps '\n' as is.
    with codecs.open(path, 'wb', encoding='utf-8') as txt:
        txt.write(content)