In [1]:
import os
import glob
import json
import pandas as pd

In [16]:
# Peek at (at most) the first ten feed files that glob lists.
fnames = glob.glob('scrapped/twitter/feed/*.json')[:10]
fnames

['scrapped/twitter/feed/2021-05-29T09;45;38+09;00.json',
 'scrapped/twitter/feed/2021-04-11T10;45;41+09;00.json',
 'scrapped/twitter/feed/2021-06-29T23;58;49+09;00.json',
 'scrapped/twitter/feed/2021-07-29T16;02;30+09;00.json',
 'scrapped/twitter/feed/2021-08-24T14;36;09+09;00.json',
 'scrapped/twitter/feed/2021-05-08T13;43;13+09;00.json',
 'scrapped/twitter/feed/2021-03-26T16;48;58+09;00.json',
 'scrapped/twitter/feed/2021-03-01T19;04;13+09;00.json',
 'scrapped/twitter/feed/2021-07-08T12;05;53+09;00.json',
 'scrapped/twitter/feed/2021-03-23T22;09;31+09;00.json']

In [37]:
class JsonCorpus:
    """Load scraped tweet JSON files into a table and a per-brand text corpus.

    Every feed file is read as one JSON object that must provide the keys
    ``brands`` (iterable of brand names), ``user``, ``replyCount``,
    ``retweetCount``, ``likeCount``, ``quoteCount`` and the text key given by
    ``textkey``.  The author's profile is read from ``<user_dir>/<user>.json``
    and must provide ``followersCount``, ``friendsCount``, ``statusesCount``,
    ``favouritesCount`` and ``listedCount``.

    Args:
        textkey: Key of the tweet text inside each feed JSON object.
        feed_dir: Directory containing the feed ``*.json`` files.
        user_dir: Directory containing one ``<username>.json`` per author.
        max_files: Upper bound on the number of feed files to load; ``None``
            loads every file.

    Attributes:
        feed_files: Sorted list of the feed paths that were loaded.
        corpus: ``pandas.DataFrame`` with one row per feed file.
        brand_corpus: ``dict`` mapping each brand to the list of texts that
            mention it.  This is what iteration, ``len()`` and ``tokenize``
            operate on.

    Raises:
        KeyError: If a feed or user file lacks one of the required keys.
        OSError: If a feed or user file cannot be opened.
    """

    # Column order of the per-tweet table; passing it explicitly keeps the
    # schema stable even when no feed file is found.
    _COLUMNS = ('nfollowers', 'nfriends', 'nstatuses', 'nfavourites',
                'nlisted', 'content', 'nreply', 'nretweet', 'nlike', 'nquote')

    def __init__(self, textkey='content', feed_dir='scrapped/twitter/feed/',
                 user_dir='scrapped/twitter/user/', max_files=1000):
        self.feed_dir = feed_dir
        self.user_dir = user_dir
        self.max_files = max_files
        self.textkey = textkey
        self.feed_files = self._feed_files
        self.brand_corpus = {}
        self.corpus = self._corpus()

    @property
    def _feed_files(self):
        """Return up to ``max_files`` feed paths in sorted order.

        ``glob`` yields paths in arbitrary order, so the listing is sorted
        before truncation to make the selected subset reproducible.
        """
        pattern = os.path.join(self.feed_dir, '*.json')
        return sorted(glob.glob(pattern))[:self.max_files]

    def _user(self, username):
        """Load and return the parsed profile JSON of ``username``."""
        ufile = os.path.join(self.user_dir, username + '.json')
        with open(ufile, encoding='UTF-8-sig') as f:
            return json.load(f)

    def _corpus(self):
        """Read every feed file once.

        Populates ``self.brand_corpus`` (brand -> list of texts) and returns
        the per-tweet ``DataFrame`` that is stored as ``self.corpus``.
        """
        by_brand = {}
        records = []

        for fname in self.feed_files:
            # Only the parse needs the file handle; release it before
            # opening the user file.
            with open(fname, encoding='UTF-8-sig') as f:
                feed = json.load(f)

            text = feed[self.textkey]
            for brand in feed['brands']:
                by_brand.setdefault(brand, []).append(text)

            user = self._user(feed['user'])
            records.append({
                'nfollowers': user['followersCount'],
                'nfriends': user['friendsCount'],
                'nstatuses': user['statusesCount'],
                'nfavourites': user['favouritesCount'],
                'nlisted': user['listedCount'],
                'content': text,
                'nreply': feed['replyCount'],
                'nretweet': feed['retweetCount'],
                'nlike': feed['likeCount'],
                'nquote': feed['quoteCount'],
            })

        self.brand_corpus = by_brand
        return pd.DataFrame(records, columns=list(self._COLUMNS))

    def __iter__(self):
        """Yield every text, brand by brand.

        A text tagged with several brands is yielded once per brand; a text
        with no brand is not yielded at all.
        """
        for sents in self.brand_corpus.values():
            yield from sents

    def __len__(self):
        """Return the number of (brand, text) pairs, matching ``__iter__``."""
        return sum(len(sents) for sents in self.brand_corpus.values())

    def tokenize(self, tagger):
        """Build a ``DocTokens`` from ``tagger`` and the per-brand texts.

        NOTE(review): ``DocTokens`` is not defined in the visible part of the
        notebook. It is assumed to accept one keyword argument per brand,
        each holding that brand's list of texts -- confirm against its
        definition.
        """
        return DocTokens(tagger, **self.brand_corpus)



In [38]:
# Build the corpus with the default settings, then echo the instance.
jcorpus = JsonCorpus()
jcorpus





<__main__.JsonCorpus at 0x7feac1eac9d0>

In [39]:
# Write the table held in `jcorpus.corpus` to test.xlsx in the current working
# directory. pandas needs an Excel writer engine (e.g. openpyxl) installed.
jcorpus.corpus.to_excel('test.xlsx')

In [7]:
# Directory with the scraped feed files; the trailing slash is kept because
# other cells build their glob pattern by plain string concatenation.
feed_dir = 'scrapped/twitter/feed/'
# Count the JSON files currently present in that directory.
len(glob.glob(os.path.join(feed_dir, '*.json')))

504838

In [44]:
# Time a full listing of the feed directory. Relies on `feed_dir` from the
# cell that defines it; the count is a snapshot of the directory at run time.
%time len(glob.glob(feed_dir + '*.json'))

CPU times: user 1.12 s, sys: 737 ms, total: 1.86 s
Wall time: 1.96 s


820884

In [41]:
# Take the last path of glob's listing. glob returns paths in arbitrary order,
# so this is not necessarily the most recent file.
ff = glob.glob(feed_dir + '*.json')[-1]
ff

'scrapped/twitter/feed/2021-08-26T08;33;24+09;00.json'