In [1]:
import json
import logging
import os

from naruto_skills import solr
import pandas as pd

In [2]:
# Configure the root logger so INFO-level progress messages are shown in the notebook.
logging.basicConfig(level=logging.INFO)

In [3]:
# Topic ids (kept as strings) whose mentions are downloaded further down.
list_topics = [
    '6732',
    '4084',
    '2638',
    '3245',
    '23709',
    '23708',
    '7798',
    '40266',
    '35786',
]

In [4]:
# Record the topic ids in the log so the run can be traced back to its inputs.
logging.info('List topics: %s', list_topics)

INFO:root:List topics: ['6732', '4084', '2638', '3245', '23709', '23708', '7798', '40266', '35786']


In [5]:
# Distinct tag ids for the Solr `tags` filter. The literal repeats the same two
# ids several times; building a set collapses it to the unique values.
tags = [*{98985, 98985, 98985, 98985, 123638, 123638, 98985, 123638, 123638}]

In [6]:
# Download the mentions of every topic in `list_topics` that match `filters`
# and stack the per-topic results into a single DataFrame `df`.

# A repeated topic id would be downloaded (and counted) twice.
assert len(list_topics) == len(set(list_topics))

# Credentials are read from the environment so no secret is stored in the
# notebook. SOLR_PASSWORD must be set before this cell runs; SOLR_USERNAME
# falls back to the account name this notebook has always used.
solr_username = os.environ.get('SOLR_USERNAME', 'trind')
solr_password = os.environ.get('SOLR_PASSWORD')
if not solr_password:
    raise RuntimeError('Set the SOLR_PASSWORD environment variable before running this cell')

# Bounds of the `copied_at` range filter (a trailing "Z" is appended below).
start = '2019-01-01T00:00:00'
end = '2019-05-01T00:00:00'
filters = (
    'q=*:*',
    'fq=-is_ignore:1',
    'fq=-is_noisy:1',
    'fq=is_approved:1',
    'wt=json',
    'fq=copied_at:[%sZ TO %sZ]' % (start, end),
    'fq=search_text:*',
    'fq=sentiment:*',
    'fq=tags:(%s)' % (' '.join([str(item) for item in tags]))
)
fields = ('id', 'copied_at', 'search_text', 'sentiment', 'sentiment_auto', 'tags', 'link', 'platform',
          'id_reference', 'created_date', 'mention_type', 'id_source', 'source_type')

# Collect one frame per topic and concatenate once at the end. Growing a
# DataFrame inside the loop copies all rows on every iteration, and
# `DataFrame.append` no longer exists in pandas 2.x.
topic_frames = []
n_rows = 0
for idx, topic in enumerate(list_topics):
    logging.info('Downloading %s/%s which is %s', idx + 1, len(list_topics), topic)
    try:
        # NOTE(review): the endpoint is plain http, so the credentials are
        # presumably sent unencrypted — confirm whether an https endpoint exists.
        df_tmp = solr.crawl_topic(domain='http://solrtopic.younetmedia.com', topic=topic, filters=filters,
                                  fields=fields,
                                  limit=50000, batch_size=4001,
                                  username=solr_username, password=solr_password)
    except (KeyError, json.decoder.JSONDecodeError) as e:
        # Best effort: a topic whose response cannot be parsed is logged with
        # its traceback and skipped, so the remaining topics still download.
        logging.exception('Error: %s', e)
        continue

    topic_frames.append(df_tmp)
    n_rows += df_tmp.shape[0]
    # The logged number is the running total over all topics downloaded so far.
    logging.info('Topic: %s - No rows: %s', topic, n_rows)

# `pd.concat` raises on an empty list, so fall back to an empty frame when
# every topic failed.
df = pd.concat(topic_frames) if topic_frames else pd.DataFrame()


INFO:root:Downloading 1/9 which is 6732
INFO:root:Crawled topic 6732 on page 1, 515/515 done
INFO:root:Crawled topic 6732 on page 2, 515/515 done
INFO:root:Topic: 6732 - No rows: 515
INFO:root:Downloading 2/9 which is 4084
INFO:root:Crawled topic 4084 on page 1, 614/614 done
INFO:root:Crawled topic 4084 on page 2, 614/614 done
INFO:root:Topic: 4084 - No rows: 1129
INFO:root:Downloading 3/9 which is 2638
INFO:root:Crawled topic 2638 on page 1, 1078/1078 done
INFO:root:Crawled topic 2638 on page 2, 1078/1078 done
INFO:root:Topic: 2638 - No rows: 2207
INFO:root:Downloading 4/9 which is 3245
INFO:root:Crawled topic 3245 on page 1, 1668/1668 done
INFO:root:Crawled topic 3245 on page 2, 1668/1668 done
INFO:root:Topic: 3245 - No rows: 3875
INFO:root:Downloading 5/9 which is 23709
INFO:root:Crawled topic 23709 on page 1, 364/364 done
INFO:root:Crawled topic 23709 on page 2, 364/364 done
INFO:root:Topic: 23709 - No rows: 4239
INFO:root:Downloading 6/9 which is 23708
INFO:root:Crawled topic 2370

In [7]:
# (rows, columns) of the concatenated download, before any cleaning.
df.shape

(4809, 14)

In [8]:
# Keep a single row per mention `id` (pandas keeps the first occurrence by default).
df = df.drop_duplicates(subset=['id'])

In [9]:
# Discard rows that have no `search_text` value.
df = df.dropna(subset=['search_text'])

In [10]:
# (rows, columns) after removing duplicate ids and rows without `search_text`.
df.shape

(4663, 14)

In [11]:
# Store the element at position 1 of every `search_text` value in a new `mention` column.
# NOTE(review): presumably `search_text` is a multi-valued field (a list) whose second
# entry holds the mention text — confirm; a value with fewer than 2 elements raises IndexError.
df['mention'] = df['search_text'].map(lambda x: x[1])

In [12]:
# Keep only the rows whose extracted mention is not the empty string.
has_mention_text = df['mention'].ne('')
df = df.loc[has_mention_text]

In [13]:
# (rows, columns) after adding `mention` and dropping rows where it is empty.
df.shape

(4663, 15)

In [14]:
# Write the cleaned mentions to CSV; the DataFrame index is left out of the file.
df.to_csv(
    '/source/main/data_download/output/positive_class_11.csv',
    index=False,
)

# Temp: scratch exploration of taste-related sentiment tags

In [None]:
# Load the auxiliary app-tags table from CSV into `df_tags`.
df_tags = pd.read_csv('/source/main/data_download/output/auxiliary/app_tags.csv')

In [None]:
# Display the tag row(s) whose `id` equals 87294.
is_tag_87294 = df_tags['id'].eq(87294)
df_tags[is_tag_87294]

In [None]:
# Discard tag rows that have no `id`.
df_tags = df_tags.dropna(subset=['id'])

In [None]:
# Keep only the tags whose `type` is exactly 'ATTRIBUTE'.
is_attribute = df_tags['type'].eq('ATTRIBUTE')
df_tags = df_tags[is_attribute]

In [None]:
# Keep only the tags whose name contains "taste", ignoring letter case.
mentions_taste = df_tags['name'].map(lambda tag_name: 'taste' in tag_name.lower())
df_tags = df_tags[mentions_taste]

In [None]:
# Discard tag rows that have no `id_sentiment_domain`.
df_tags = df_tags.dropna(subset=['id_sentiment_domain'])

In [None]:
# Keep only the tags of the 'milk' and 'coffee_drink' sentiment domains
# (the domain value is compared case-insensitively).
in_wanted_domain = df_tags['id_sentiment_domain'].map(
    lambda domain: domain.lower() in ('milk', 'coffee_drink')
)
df_tags = df_tags[in_wanted_domain]

In [None]:
def _tag_ids_named(keyword):
    """Return the set of `id` values of the tags whose lower-cased name contains `keyword`."""
    name_matches = df_tags['name'].map(lambda tag_name: keyword in tag_name.lower())
    return set(df_tags[name_matches]['id'])


# Split the remaining tag ids by the sentiment word that appears in the tag name.
pos = _tag_ids_named('positive')
neutral = _tag_ids_named('neutral')
neg = _tag_ids_named('negative')

In [None]:
# Preview two of the positive tag ids. `pos` is a set, and a set cannot be
# sliced (`pos[:2]` raises TypeError), so sort it into a list first; sorting
# also makes the preview deterministic.
sorted(pos)[:2]

In [None]:
# Preview two of the negative tag ids. `neg` is a set, and a set cannot be
# sliced (`neg[:2]` raises TypeError), so sort it into a list first; sorting
# also makes the preview deterministic.
sorted(neg)[:2]

In [None]:
# List the distinct `id` values left in `df_tags`; the order is arbitrary because it comes from a set.
list(set(df_tags['id']))