In [2]:
import pandas as pd
import os
import gzip
from tqdm.autonotebook import tqdm
import re

In [2]:
def en_contraction_removal(text: str) -> str:
    apostrophe_handled = re.sub("’", "'", text)
    # from https://gist.githubusercontent.com/tthustla/74e99a00541264e93c3bee8b2b49e6d8/raw/599100471e8127d6efad446717dc951a10b69777/yatwapart1_01.py
    contraction_mapping = {
                    "i.e.": 'for example',
                    "e.g.": 'for example',
                    "youre": "you are",
                    "youll": "you will",
                    "theyre": "they are", "theyll": "they will",
                    "weve": "we have",
                    "shouldnt": "should not",
                    "dont": "do not",
                    "doesnt": "does not", "doesn": "does not",
                    "didnt": "did not",
                    "wasn": "was not",
                    "arent": "are not", "aren": "are not",
                    "aint": "is not", "isnt": "is not", "isn": "is not",
                    "wouldnt": "would not", "wouldn": "would not",
                    "ain't": "is not", "aren't": "are not","can't": "cannot", 
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                       "here's": "here is",
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" }
    expanded = ' '.join([contraction_mapping[t.lower()] if t.lower() in contraction_mapping else t for t in apostrophe_handled.split(" ")])
    return expanded

In [3]:
import hunspell

hobj = hunspell.HunSpell('/Library/Spelling/en_US.dic', '/Library/Spelling/en_US.aff')


known_words = ['GMO', 'GMOs', 'Wal-Mart', 'Publix', 'Glyphosate', 'co2', 'Waitrose', '<URL>', 'certifier', 'TLDR', 'Coca~Cola', 'Quora', 'sci-fi']

for w in known_words:
    hobj.add(w)

In [4]:
url_regex = r'(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&\(\)\*\+,;=.]+'

def replace_urls_regex(sentence: str, url_token: str = '<URL>') -> str:
    return re.sub(url_regex, url_token, sentence)

def replace_urls(words, url_token: str = '<URL>'):
    return [url_token if (w.lower().startswith('www') or w.lower().startswith('http')) else w for w in words]

def spellcheck_sentence(row) -> str:
    sent = row['Sentence']
    #print(sent)
    to_remove = [',', '(', ')', ':', '?', '&', '/', '*']
    for tr in to_remove:
        sent = sent.replace(tr, ' ')
        
    sent = sent.replace('€™', "'")
    sent = sent.replace('�', "'")
    sent = en_contraction_removal(sent)
    sent = sent.replace("'", ' ')
    sent = replace_urls_regex(sent)


    tokens = sent.split(' ')
    result = []
    for t in tokens:
        if t == ' ':
            continue
        if not hobj.spell(t):
            suggestions = hobj.suggest(t)
            if not suggestions:
                result.append(t)
            else:
                if suggestions[0] == 'e':
                    result.append(t)
                    continue
                result.append(suggestions[0])
                print(f'{t} -> {suggestions[0]}')
        else:
            result.append(t)
    return ' '.join(result)
        
#spellcheck_sentence('This is a tset with a wong wod. Adn now anotheer one why does this notjn workd')

In [5]:
splits = ['train', 'validation', 'test']
path = os.path.join(os.getcwd(), 'data', 'data', 'organic2019')

for s in splits:
    print('Split: ' + str(s))
    fn = os.path.join(path, s + '.csv')
    df = pd.read_csv(fn, sep='|') #
    df['Sentence'] = df.apply(spellcheck_sentence, axis=1)
    
    fn = os.path.join(path, s + '_sp.csv')
    df.to_csv(fn, sep='|', index=False)

Split: train
externalities -> externalizes
Walkin -> Walking
DeKalb -> Kalb
labour -> labor
NPOP -> POP
NOP -> BOP
NPOP -> POP
bestest -> best est
bestest -> best est
amoungst -> amount
enviroments -> environments
enviroments -> environments
knowlege -> knowledge
metioned -> mentioned
deminish -> diminish
monoculture -> mono culture
fertilisers -> fertilizers
‘dead -> dead
labour -> labor
vegitarians -> vegetarians
superbugs -> super bugs
‘catching -> catching
superbug -> super bug
fertilisers -> fertilizers
benifits -> benefits
fradulent -> fraudulent
Muricans -> Americans
buds! -> buds
$3 -> 3
$1 -> 1
embryonated -> embryonic
review406 -> reviewer
macerators -> accelerators
poults -> pouts
Organisation -> Organization
Union.4 -> Union
food[1]. -> food
aflatoxin. -> antitoxin
‚Äúscientists‚Äù -> conscientiousness
supporter‚Äù -> supporter
aren‚Äôt -> aren't
can‚Äôt -> cantata
aren‚Äôt -> aren't
that‚Äôs -> thatch
seedstock -> seed stock
don‚Äôt -> donation
there‚Äôs -> Therese
america

labelled -> labeled
UHT -> HUT
Darlington -> Darling ton
+6 -> 6
teperature -> temperature
recognised -> recognized
recognises -> recognizes
UAE. -> USE
yeild -> yield
eco -> Eco
craftsmenship. -> craftsmanship
Thoughtscapism -> Thoughtfulness
minimise -> minimize
IPM -> IMP
minimise -> minimize
IPM -> IMP
IPM -> IMP
1% -> 1
“Everything -> everything
“natural” -> natural
“natural” -> natural
Bt -> Br
Bt-biotech -> Br-biotech
labour -> labor
eco-friendly. -> Eco-friendly
bioengineering -> bio engineering
certifiers -> rectifiers
Swinney -> Spinney
EatingExpectantly -> Eating Expectantly
nitrous -> nitro us
agriculture—typically -> electromagnetically
agriculture—impose -> agriculturalist
“ammonia -> ammonia
nitrous -> nitro us
systems” -> systems
“land -> land
acidification -> solidification
unit.” -> unity
Colors! -> Colors
organicness. -> organics
NPOP -> POP
BioSuisse -> Biosensor
certifcatin -> certification
price! -> price
PGS -> PG
PGS -> PG
PGS -> PG
Kolkata -> Katakana
Jamshedpu

“non-organic” -> non-organic”
“inorganic”. -> inorganic
“Organic -> organic
harmony.” -> harmony
NOSB -> NOBS
harmony.” -> harmony
NOSB -> NOBS
harmony.” -> harmony
NOSB -> NOBS
Strough -> Trough
Strough -> Trough
vegitables -> vegetables
Strough -> Trough
farmstand -> farm stand
california -> California
florida -> Florida
IOFGA -> FIGARO
eco-system -> Eco-system
food! -> food
fertilisers -> fertilizers
fertilisers -> fertilizers
fertilisers -> fertilizers
labelled -> labeled
organic=pesticide -> disorganization
labelled -> labeled
labelled -> labeled
organice -> organic
“organic” -> organic
cancer[1]. -> cancer
Flavonones -> Flagstones
eco-friendly -> Eco-friendly
list! -> list
“organic” -> organic
apeda -> aped
organisation. -> organization
labour -> labor
‚Äúorganic‚Äù -> organically
bioengineered -> bio engineered
se -> SE
[Am -> Am
Clin -> Cline
Nutr. -> Nut
Antimicrobials -> Antimicrobial
Shailendra -> Handrail
Dhakad -> Dhaka
A2A. -> AHA
ol -> lo
throughly -> thoroughly
sceptica

convential -> conventional
eco-agriculture -> Eco-agriculture
dataset -> data set
Badgley -> Bradley
paleoecologist -> paleontologist
kilocalories -> kilo calories
extrenely -> extremely
guage -> gauge
Berkley -> Berkeley
fertiziler -> fertilizer
notrogen -> nitrogen
dependance -> dependence
Maikaal -> Baikal
socio-economic -> sociology-economic
deromanticize -> romanticize
eco -> Eco
eco-friendly -> Eco-friendly
amout -> amour
neonicotinoid -> nonidentical
agrochemical -> petrochemical
fertilised -> fertilized
fertilised -> fertilized
fertiliser. -> fertilizer
monocultures -> mono cultures
“pesticides” -> pesticide
“bulked -> bulked
up” -> up
Deekshith -> Deanship
socio-economic -> sociology-economic
intra- -> intro-
improvethe -> improve the
withoutincreasing -> without increasing
growingour -> growing our
processsuch -> process such
permaculture -> aquaculture
aquaponics. -> quadraphonic
valuableskill. -> valuable skill
carbonfootprint -> carbon footprint
carbonfootprint -> carbon f

Clewiston -> Pleistocene
Des -> Es
Moines -> Moises
napa -> naps
flavouring. -> flavoring
porridges -> porridge
doughs. -> soughs
flavouring -> flavoring
discussion! -> discussion
phytonutrients -> nutritionists
glyphosate -> Glyphosate
CAFO -> CAFE
“inorganic” -> inorganic
guar -> gar
xantham -> lanthanum
monocultures -> mono cultures
pesticides!. -> pesticides
FIFRA -> RIFFRAFF
rotenone -> rote none
pyrethrin -> preshrink
toxification. -> detoxification
be! -> be
be! -> be
shing -> shin
energy! -> energy
alimentation. -> implementation
dixine -> divine
eggs… -> eggs
Celiac -> Celia
brainer -> brainier
Nagar -> Agar
benefot -> benefit
2X -> 2
snarky! -> snarky
willy-nilly -> willy-billy
you‚Äôre -> you're
psycological -> psychological
kool-aid. -> look-aid
evidense -> evidence
HEALTHY! -> HEALTHY
ok -> OK
exerpt -> exert
ssys -> says
fertilisers -> fertilizers
pyrethrins -> preshrinks
sulphur -> sulfur
‘recipes -> recipes
chillies -> sillies
castile -> castle
chillies. -> sillies
IARC

Gugaon -> Gauguin
recieving -> receiving
Gurgaon. -> Surgeon
Mascon -> Mason
Mascon -> Mason
forcooking. -> for cooking
‚Äúorganic‚Äù. -> organically
what‚Äôs -> whatsits
bigbasket -> big basket
baazarcart -> barcarole
Modmarket -> Mod market
flavour -> flavor
mayhaps. -> mishaps
favour. -> favor
chai -> chi
CHAI -> CHI
Ora‚Äôs -> Orators
superfoods -> super foods
acai -> acacia
spirulina. -> spiritual
6g -> 6
carb -> crab
You‚Äôre -> You're
protein-to-carb -> protein-to-crab
flavour -> flavor
better! -> better
celery! -> celery
they‚Äôre -> they're
‚Äúorganic‚Äù -> organically
Yuki -> Yuri
will! -> will
ageing -> aging
fibre -> fiber
fibre -> fiber
levelled -> leveled
fibres -> fibers
fibres -> fibers
favourite -> favorite
Here‚Äôs -> Heresy
they‚Äôre -> they're
they‚Äôre -> they're
flavour. -> flavor
flavour. -> flavor
flavour -> flavor
flavour -> flavor
flavour. -> flavor
Macleans -> Mac leans
Brix -> Brie
brix -> brie
brix -> brie
refractometer -> refractory
brix -> brie
brix -> br

pantryperks -> pantry perks
Theo -> Thea
Lundberg -> Lindbergh
rich-flavoured -> rich-flavored
Dalbasket. -> Basketball
TMI -> TI
COLMAN -> CONMAN
arhar -> harsh
dal -> lad
chana -> kana
dal -> lad
lobia -> labia
kabuli -> kabuki
chana -> kana
urad -> rad
dal -> lad
moong -> mung
dal -> lad
masoor -> masonry
dal -> lad
malka -> alkali
dal -> lad
desi -> dies
kala -> koala
chana -> kana
sabut -> abut
moong -> mung
sabut -> abut
masoor -> masonry
Dalbasket -> Basketball
flavoured -> flavored
dals -> lads
Exporteurs -> Exporters
Importeurs -> Importers
Exporteurs -> Exporters
Importeurs -> Importers
Certifed -> Certified
Northbourne -> Northbound
farming—in -> farming
withfeed -> with feed
defatted -> defalcated
flavour-enhancers -> flavor-enhancers
colouring-agents.<URL> -> coloring-agents.<URL>
labelling -> labeling
fertilisers -> fertilizers
subsidised -> subsidized
colonised -> colonized
fertilisers -> fertilizers
subsidised -> subsidized
fail! -> fail
higher! -> higher
localvore-styl

Redueces -> Reduces
Eatofresh-First -> Freshwater-First
Mohali -> Mohair
Panchkula -> Launchpad
McD -> MD
pre-cooked -> per-cooked
Organica -> Organics
ayurvedic -> predicative
un-ripe -> UN-ripe
pyrethrum -> pyre thrum
pyrethrum -> pyre thrum
difference! -> difference
cahunas -> kahunas
certifiers -> rectifiers
portapotties -> portieres
Rotenone -> Rote none
rotenone -> rote none
neighbouring -> neighboring
“organic” -> organic
“a -> a
expensive.” -> expensiveness
;- -> e-
EU-eco -> EU-Eco
worl -> work
reposting -> reposing
organic=healthy -> organically
catechins -> catechist
anti-ageing -> anti-aging
TeaDerived -> Tea Derived
Sinensis -> Insistence
flavour -> flavor
colour. -> color
Polyphenols -> Polyphemus
phytochemicals -> petrochemicals
TeaWhile -> Te Awhile
Sinensis -> Insistence
polyphenols. -> polyphony
polyphenols -> polyphony
Theanine -> Thea nine
TeaThe -> Tea The
oneâ -> one
Oolong -> Oblong
TeaThis -> Tea This
lipase -> slipcase
oolong -> oblong
TeaBrewed -> Tea Brewed
f

In [6]:
df[df['Sentence'].isnull()]

Unnamed: 0.1,Unnamed: 0,index,Author_ID,Author_name,Comment_number,Sentence_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Aspect


# Exploration

In [4]:
path = os.path.join(os.getcwd(), 'data', 'data', 'organic2019')

In [45]:
df_train = pd.read_csv(os.path.join(path, 'train_sp.csv'), sep='|')
df_val = pd.read_csv(os.path.join(path, 'validation_sp.csv'), sep='|')
df_test = pd.read_csv(os.path.join(path, 'test_sp.csv'), sep='|')
df_test

Unnamed: 0.1,Unnamed: 0,index,Author_ID,Author_name,Comment_number,Sentence_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Aspect
0,5948,699,Metin-Ozsavran,Metin Ozsavran,740.0,1.0,9,n,g,g,Fresh foods are basically non-tolerable,quora.json,g-g
1,5949,700,Metin-Ozsavran,Metin Ozsavran,740.0,2.0,0,,,,Therefore its not fitting to think in terms of...,quora.json,
2,5950,701,Metin-Ozsavran,Metin Ozsavran,740.0,3.0,9,0,g,p,Price totally depends on current supply level ...,quora.json,g-p
3,5951,702,Metin-Ozsavran,Metin Ozsavran,740.0,4.0,9,0,cp,av,Demand for staples like tomatoes will be ever ...,quora.json,cp-av
4,5952,703,Metin-Ozsavran,Metin Ozsavran,740.0,5.0,9,n,cp,c,Price of organics depends on price of conventi...,quora.json,cp-c
5,5953,704,Metin-Ozsavran,Metin Ozsavran,740.0,6.0,9,n,p,p,If difference is higher than 50% consumers st...,quora.json,p-p
6,5954,705,Metin-Ozsavran,Metin Ozsavran,740.0,7.0,9,0,g,l,So you need to find your own local ways of usi...,quora.json,g-l
7,5955,706,Metin-Ozsavran,Metin Ozsavran,740.0,8.0,0,,,,Old timer experience and wisdom needed,quora.json,
8,5956,707,Metin-Ozsavran,Metin Ozsavran,740.0,9.0,9,p,g,g,For veggies you need min,quora.json,g-g
9,5957,708,Metin-Ozsavran,Metin Ozsavran,740.0,10.0,0,,,,50% gross margin in farming business if not h...,quora.json,


In [33]:
# number of quora sources
df_train[df_train['Source_file'] == 'quora.json'].count()['index']/df_train.count()['index']

0.9998878546596389

In [35]:
df_train.count()

Unnamed: 0          8917
index               8917
Author_ID           8917
Author_name         8917
Comment_number      8917
Domain_Relevance    8917
Sentiment           4779
Entity              4779
Attribute           4779
Sentence            8917
Source_file         8916
Aspect              4778
dtype: int64

In [36]:
df_val[df_val['Source_file'] == 'quora.json'].count()['index']/df_val.count()['index']

1.0

In [37]:
df_test[df_test['Source_file'] == 'quora.json'].count()['index']/df_test.count()['index']

1.0

In [38]:
df_train[df_train['Source_file'] != 'quora.json']

Unnamed: 0_level_0,Unnamed: 0,index,Author_ID,Author_name,Comment_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Aspect
Sentence_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
25.0,812,812,Justin-Ma,Justin Ma,1317.0,9,p,c,National Organic Program The comments should b...,Quora son,,


# number of samples & comments

In [39]:
df_train.count()['index'] + df_val.count()['index'] + df_test.count()['index']

10439

In [40]:
df_train.groupby('Comment_number').sum()

Unnamed: 0_level_0,Unnamed: 0,index,Domain_Relevance
Comment_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,65436,21,36
2.0,215349,414,207
3.0,65646,231,45
4.0,37534,154,36
6.0,94047,597,81
7.0,84735,630,72
8.0,37686,306,36
9.0,28275,240,27
11.0,151000,1480,144
15.0,236775,3150,144


In [29]:
df_val.groupby('Comment_number').sum()

Unnamed: 0_level_0,Unnamed: 0,index,Sentence_number,Domain_Relevance
Comment_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14.0,28371,336,5.0,27
25.0,19115,425,3.0,18
43.0,9761,416,1.0,9
49.0,58923,2853,21.0,36
57.0,29799,1764,6.0,0
60.0,19919,1229,3.0,9
71.0,60323,4253,23.0,9
78.0,30306,2271,6.0,0
96.0,40680,3300,12.0,18
128.0,20683,1993,5.0,18


In [30]:
df_test.groupby('Comment_number').sum()

Unnamed: 0_level_0,Unnamed: 0,index,Sentence_number,Domain_Relevance
Comment_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5.0,131495,665,85.0,126
10.0,28284,249,6.0,18
12.0,56691,621,21.0,9
13.0,37814,434,10.0,36
22.0,152648,3128,69.0,144
33.0,67424,2009,19.0,54
45.0,19545,855,5.0,9
61.0,19923,1233,3.0,18
64.0,49850,3125,15.0,45
94.0,30489,2454,6.0,27


In [52]:
df_train[df_train.duplicated(['Sentence'], keep=False)]

Unnamed: 0.1,Unnamed: 0,index,Author_ID,Author_name,Comment_number,Sentence_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Aspect
43,9566,221,Oliver-Leslie,Oliver Leslie,27.0,1.0,9,n,f,or,it is getting to a point where growing your ow...,quora.json,f-or
44,9567,222,Oliver-Leslie,Oliver Leslie,27.0,1.0,9,n,cf,ll,it is getting to a point where growing your ow...,quora.json,cf-ll
49,9572,227,Oliver-Leslie,Oliver Leslie,27.0,6.0,9,n,p,c,Hey the best est apple in the world still has...,quora.json,p-c
50,9573,228,Oliver-Leslie,Oliver Leslie,27.0,6.0,9,n,cp,c,Hey the best est apple in the world still has...,quora.json,cp-c
51,9574,229,Oliver-Leslie,Oliver Leslie,27.0,7.0,0,,,,But growing your own also has issues - who ...,quora.json,
52,9575,230,Oliver-Leslie,Oliver Leslie,27.0,8.0,0,,,,Are the seeds tampered with,quora.json,
53,9576,231,Oliver-Leslie,Oliver Leslie,27.0,7.0,9,n,g,g,But growing your own also has issues - who ...,quora.json,g-g
54,9577,232,Oliver-Leslie,Oliver Leslie,27.0,7.0,9,n,cg,or,But growing your own also has issues - who ...,quora.json,cg-or
55,9578,233,Oliver-Leslie,Oliver Leslie,27.0,8.0,0,,,,Are the seeds tampered with,quora.json,
56,9579,234,Oliver-Leslie,Oliver Leslie,27.0,9.0,9,n,g,ll,Who the hell knows any more,quora.json,g-ll


## Count of sentences with two or more aspects

In [55]:
df_train[df_train.duplicated(['Sentence'], keep='first')].count()['index']+df_val[df_val.duplicated(['Sentence'], keep='first')].count()['index']+df_test[df_test.duplicated(['Sentence'], keep='first')].count()['index']

668

## Count of domain relevant sentences

In [59]:
df_train[df_train['Domain_Relevance'] != 0].count()['index']+df_test[df_test['Domain_Relevance'] != 0].count()['index']+df_val[df_val['Domain_Relevance'] != 0].count()['index']

5560

In [60]:
df_train[df_train.duplicated(['Aspect'], keep='first')].count()['index']+df_val[df_val.duplicated(['Aspect'], keep='first')].count()['index']+df_test[df_test.duplicated(['Aspect'], keep='first')].count()['index']

10184

In [72]:
df_train_rel = df_train[df_train['Domain_Relevance'] == 9]
df_train_rel[df_train.duplicated(['Sentiment'], keep='last')]

  


Unnamed: 0.1,Unnamed: 0,index,Author_ID,Author_name,Comment_number,Sentence_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Aspect
0,5282,33,Robert-Wagner-2,Robert Wagner,655.0,1.0,9,0,g,av,Companies selling the largest volume of organi...,quora.json,g-av
1,5283,34,Robert-Wagner-2,Robert Wagner,655.0,2.0,9,0,cc,av,Which is best depends on your receptiveness to...,quora.json,cc-av
2,5284,35,Robert-Wagner-2,Robert Wagner,655.0,3.0,9,p,g,h,Companies are in business to make money not d...,quora.json,g-h
3,5285,36,Robert-Wagner-2,Robert Wagner,655.0,4.0,9,p,p,h,I dislike Whole Foods because it fosters the i...,quora.json,p-h
4,5286,37,Robert-Wagner-2,Robert Wagner,655.0,5.0,9,p,c,g,My picks for best are Trader Joe and HEB TX,quora.json,c-g
8,1026,27,Justin-Ma,Justin Ma,521.0,4.0,9,p,cg,pp,Industrialization is everything about producti...,quora.json,cg-pp
11,1029,30,Justin-Ma,Justin Ma,521.0,7.0,9,0,g,g,Organic agriculture goes along with that the...,quora.json,g-g
24,10307,962,Daniel-Woodard,Daniel Woodard,125.0,2.0,9,n,cc,t,The big supermarkets Kroger and Publix in Atl...,quora.json,cc-t
25,10308,963,Daniel-Woodard,Daniel Woodard,125.0,3.0,9,p,c,q,The meat and seafood is good at Whole Foods as...,quora.json,c-q
28,10314,969,Daniel-Woodard,Daniel Woodard,125.0,7.0,9,n,c,p,it is really expensive,quora.json,c-p


In [67]:
df_train[df_train['Aspect'] != None]

Unnamed: 0.1,Unnamed: 0,index,Author_ID,Author_name,Comment_number,Sentence_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Aspect
0,5282,33,Robert-Wagner-2,Robert Wagner,655.0,1.0,9,0,g,av,Companies selling the largest volume of organi...,quora.json,g-av
1,5283,34,Robert-Wagner-2,Robert Wagner,655.0,2.0,9,0,cc,av,Which is best depends on your receptiveness to...,quora.json,cc-av
2,5284,35,Robert-Wagner-2,Robert Wagner,655.0,3.0,9,p,g,h,Companies are in business to make money not d...,quora.json,g-h
3,5285,36,Robert-Wagner-2,Robert Wagner,655.0,4.0,9,p,p,h,I dislike Whole Foods because it fosters the i...,quora.json,p-h
4,5286,37,Robert-Wagner-2,Robert Wagner,655.0,5.0,9,p,c,g,My picks for best are Trader Joe and HEB TX,quora.json,c-g
5,1023,24,Justin-Ma,Justin Ma,521.0,1.0,0,,,,Thanks for the thoughtful response,quora.json,
6,1024,25,Justin-Ma,Justin Ma,521.0,2.0,0,,,,I think we actually have a lot of common groun...,quora.json,
7,1025,26,Justin-Ma,Justin Ma,521.0,3.0,0,,,,All I want to emphasize are my main points Pr...,quora.json,
8,1026,27,Justin-Ma,Justin Ma,521.0,4.0,9,p,cg,pp,Industrialization is everything about producti...,quora.json,cg-pp
9,1027,28,Justin-Ma,Justin Ma,521.0,5.0,0,,,,Creating jobs at the expense of efficiency is ...,quora.json,
