In [1]:
# Import libraries
import pandas as pd
import nltk
import string
# Create dataframe from loaded datasets
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df = pd.concat([df_train, df_test], ignore_index = True)
print("Shape of data is", df.shape)


Shape of data is (18452, 7)


In [2]:
df.head()

Unnamed: 0,ID,A1,A2,A3,A4,A5,Score
0,14103__worth_9__free_5,1,1,1,0,0,0.369248
1,4603__fretted_10__mother_16,3,3,2,2,0,1.436513
2,3706__attitudes_14__lax_12,3,3,2,1,0,0.994271
3,3098__settlement_14__achieve_12,2,1,1,0,0,0.378663
4,716__mask_30__stripped_28,2,2,2,1,1,0.722123


In [3]:
# print(str(df['ID'][0]).split('_')[2])
# print(str(df['ID'][0]).split('_')[5])

# print(str(df['ID'][0]).split('_')[0])


In [4]:
# Create a column to store the sentence ID
df['Sentence_ID'] = df['ID'].apply(lambda x : str(x).split('_')[0])

def get_head(ID_string):
  ID_string_split = ID_string.split('_')
  if ID_string_split[3] > ID_string_split[6]:
    return ID_string_split[5]
  return ID_string_split[2]

def get_tail(ID_string):
  ID_string_split = ID_string.split('_')
  if ID_string_split[3] < ID_string_split[6]:
    return ID_string_split[5]
  return ID_string_split[2]

# Create two columns for storing the two words of each pair of the dataset
df['Head'] = df['ID'].apply(lambda x : get_head(x))
df['Tail'] = df['ID'].apply(lambda x : get_tail(x))


# df['Word_1'] = df['ID'].apply(lambda x : str(x).split('_')[2])
# df['Word_2'] = df['ID'].apply(lambda x : str(x).split('_')[5])
# df['Sentence_ID'] = df['ID'].apply(lambda x : int(str(x).split('_')[0]))
# print(type(df['Sentence_ID']))
# max = df['Sentence_ID'].max()
# print(max)

In [5]:
df.head()

Unnamed: 0,ID,A1,A2,A3,A4,A5,Score,Sentence_ID,Head,Tail
0,14103__worth_9__free_5,1,1,1,0,0,0.369248,14103,free,worth
1,4603__fretted_10__mother_16,3,3,2,2,0,1.436513,4603,fretted,mother
2,3706__attitudes_14__lax_12,3,3,2,1,0,0.994271,3706,lax,attitudes
3,3098__settlement_14__achieve_12,2,1,1,0,0,0.378663,3098,achieve,settlement
4,716__mask_30__stripped_28,2,2,2,1,1,0.722123,716,stripped,mask


In [6]:
# Download corpus and tagset for POS tagging
nltk.download('brown')
nltk.download('universal_tagset')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [7]:
# Find the tags for each word in the corpus
wordtags = nltk.ConditionalFreqDist((w.lower(), t) 
        for w, t in nltk.corpus.brown.tagged_words(tagset="universal"))

In [8]:
# Find POS tags for each word from the dataset by comparing with tags from corpus
df['Head_tags'] = df['Head'].apply(lambda x : set(wordtags[str(x)]))
df['Tail_tags'] = df['Tail'].apply(lambda x : set(wordtags[str(x)]))

In [9]:
df.head()

Unnamed: 0,ID,A1,A2,A3,A4,A5,Score,Sentence_ID,Head,Tail,Head_tags,Tail_tags
0,14103__worth_9__free_5,1,1,1,0,0,0.369248,14103,free,worth,"{VERB, ADJ, ADV}","{ADJ, NOUN}"
1,4603__fretted_10__mother_16,3,3,2,2,0,1.436513,4603,fretted,mother,{VERB},"{VERB, NOUN}"
2,3706__attitudes_14__lax_12,3,3,2,1,0,0.994271,3706,lax,attitudes,{ADJ},{NOUN}
3,3098__settlement_14__achieve_12,2,1,1,0,0,0.378663,3098,achieve,settlement,{VERB},{NOUN}
4,716__mask_30__stripped_28,2,2,2,1,1,0.722123,716,stripped,mask,{VERB},"{VERB, NOUN}"


In [10]:
# Used for creating Noun_pair truth value column
def is_noun_pair(word_1_tag, word_2_tag, score):
  # if word_1_tag and word_2_tag:
    # if word_1_tag[0] == 'NOUN' and word_2_tag[0] == 'NOUN' and score >= 1.1:
  if 'NOUN' in word_1_tag and 'NOUN' in word_2_tag and score > 0:
    return True
  return False

In [11]:
df['Noun_pair'] = df.apply(lambda x : is_noun_pair(x.Head_tags, x.Tail_tags, x.Score), axis = 1)
# for i in range(len(df)):
#   if 'NOUN' in df['Word 1 tags'][i] and 'NOUN' in df['Word 2 tags'][i]:
#     df['Noun pair'][i] = True
#   else:
#     df['Noun pair'][i] = False

In [12]:
# Download package for lemmatization
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [13]:
# Store lemmatized forms for the nouns from each noun-pair of the dataset
wnl = nltk.stem.WordNetLemmatizer()
def call_lemmatize(noun_pair, x):
  if noun_pair:
    return wnl.lemmatize(x)
  return None
df['Head_noun_lemmatised'] = df.apply(lambda x : call_lemmatize(x.Noun_pair, x.Head), axis = 1)
df['Tail_noun_lemmatised'] = df.apply(lambda x : call_lemmatize(x.Noun_pair, x.Tail), axis = 1)
df.head()

Unnamed: 0,ID,A1,A2,A3,A4,A5,Score,Sentence_ID,Head,Tail,Head_tags,Tail_tags,Noun_pair,Head_noun_lemmatised,Tail_noun_lemmatised
0,14103__worth_9__free_5,1,1,1,0,0,0.369248,14103,free,worth,"{VERB, ADJ, ADV}","{ADJ, NOUN}",False,,
1,4603__fretted_10__mother_16,3,3,2,2,0,1.436513,4603,fretted,mother,{VERB},"{VERB, NOUN}",False,,
2,3706__attitudes_14__lax_12,3,3,2,1,0,0.994271,3706,lax,attitudes,{ADJ},{NOUN},False,,
3,3098__settlement_14__achieve_12,2,1,1,0,0,0.378663,3098,achieve,settlement,{VERB},{NOUN},False,,
4,716__mask_30__stripped_28,2,2,2,1,1,0.722123,716,stripped,mask,{VERB},"{VERB, NOUN}",False,,


In [14]:
df.shape

(18452, 15)

In [15]:
# Dictionary storing lemmatized nouns from the noun-pairs as keys and sets containing the corresponding adjectives as values
nouns_pair_word_adj_dict = {}
for row in  df.itertuples(index = False):
  # Initialise the set corresponding to each lemmatized noun of each noun-pair to the empty set
  if row.Noun_pair:
    # Commented out
    nouns_pair_word_adj_dict[row.Head_noun_lemmatised] = set()
    nouns_pair_word_adj_dict[row.Tail_noun_lemmatised] = set()
    # Commented out end
    # Remove this
    # nouns_pair_word_adj_dict[row.Word_1] = set()
    # nouns_pair_word_adj_dict[row.Word_2] = set()
    # Remove this end
# # words_set = set()
# # for word in df['Word_1']:
# #   words_set.add(word)
# # for word in df['Word_2']:
# #   words_set.add(word)

# Write the unique nouns to a file
# with open('words.txt', mode = 'w') as unique_words_file:
#   for word in nouns_pair_word_adj_dict:
#     unique_words_file.write(word + '\n')
# df.to_csv('Output.csv')

In [16]:
# Count the noun pairs from the given word pairs
noun_pair_count = 0
for i in df.Noun_pair:
  if i:
    noun_pair_count += 1

In [17]:
noun_pair_count

6135

In [18]:
# Read adjective-noun pairs file
# Add adjectives to the respective lemmatized nouns' dictionary entry
with open('3244ANPs.txt') as file:
  for line in file:
    split_line = line.split('_')
    if len(split_line) == 2:
      adj_part, noun_part = line.split('_')
      adj = adj_part.split()[-1]
      # Commented out
      noun = wnl.lemmatize(noun_part.split()[0])
      # Commented out end
      # Remove this
      # noun = noun_part.split()[0]
      # Remove this end
      
      if noun in nouns_pair_word_adj_dict:
        (nouns_pair_word_adj_dict[noun]).add(adj)

df_anp = pd.read_csv('AN-phrase-annotations.csv')
count = 0
for row in df_anp.itertuples(index = False):
  noun = wnl.lemmatize(row.noun.split('_')[0])
  adj = row.adjective.split('_')[0]
  if noun in nouns_pair_word_adj_dict:
    nouns_pair_word_adj_dict[noun].add(adj)


print(nouns_pair_word_adj_dict)    
    # if (len(split_line) == 2):
    #   if split_line[1] in nouns_pair_word_adj_dict:
    #     nouns_pair_word_adj_dict[split_line[1]].add(split_line[0].strip())      



In [19]:
len(nouns_pair_word_adj_dict['hair'])

37

In [20]:
anp_count = 0
for noun in nouns_pair_word_adj_dict:
  if nouns_pair_word_adj_dict[noun]:
    anp_count += 1
print(anp_count)

1466


In [21]:
def get_common_adjectives(noun_1, noun_2):
  if noun_1 in nouns_pair_word_adj_dict and noun_2 in nouns_pair_word_adj_dict:
    return nouns_pair_word_adj_dict[noun_1].intersection(nouns_pair_word_adj_dict[noun_2])
  return None
# Commented out
df['Common_adjectives'] = df.apply(lambda x : get_common_adjectives(x.Head_noun_lemmatised, x.Tail_noun_lemmatised), axis = 1)
# Commented out end
# Remove this
# df['Common_adjectives'] = df.apply(lambda x : get_common_adjectives(x.Word_1, x.Word_2), axis = 1)
# Remove this end


In [22]:
'''df['Words_score_sentenceID_set'] = df.apply(lambda x : set((x.Sentence_ID, x.Noun_1_lemmatised,
                                                            x.Noun_2_lemmatised, x.Score)), axis = 1)
'''
# def get_word_score_sentenceID_tuple(sentenceID, noun1, noun2, score):
#   temp = (sentenceID, noun1, noun2, score)
#   temp = sorted(str(i) for i in temp)
#   return tuple(temp)



# df['Words_score_sentenceID_tuple'] = df.apply(lambda x : get_word_score_sentenceID_tuple(x.Sentence_ID, x.Noun_1_lemmatised,
#                                                                                          x.Noun_2_lemmatised, x.Score), axis = 1)


# df['Words_score_sentenceID_tuple'] = df.apply(lambda x : tuple(sorted([x['Sentence_Id'], x['Noun_1_lemmatised'],
#                                                                        x['Noun_2_lemmatised'], x['Score']])), axis = 1)

"df['Words_score_sentenceID_set'] = df.apply(lambda x : set((x.Sentence_ID, x.Noun_1_lemmatised,\n                                                            x.Noun_2_lemmatised, x.Score)), axis = 1)\n"

In [23]:
df.head()

Unnamed: 0,ID,A1,A2,A3,A4,A5,Score,Sentence_ID,Head,Tail,Head_tags,Tail_tags,Noun_pair,Head_noun_lemmatised,Tail_noun_lemmatised,Common_adjectives
0,14103__worth_9__free_5,1,1,1,0,0,0.369248,14103,free,worth,"{VERB, ADJ, ADV}","{ADJ, NOUN}",False,,,
1,4603__fretted_10__mother_16,3,3,2,2,0,1.436513,4603,fretted,mother,{VERB},"{VERB, NOUN}",False,,,
2,3706__attitudes_14__lax_12,3,3,2,1,0,0.994271,3706,lax,attitudes,{ADJ},{NOUN},False,,,
3,3098__settlement_14__achieve_12,2,1,1,0,0,0.378663,3098,achieve,settlement,{VERB},{NOUN},False,,,
4,716__mask_30__stripped_28,2,2,2,1,1,0.722123,716,stripped,mask,{VERB},"{VERB, NOUN}",False,,,


In [24]:
df.shape

(18452, 16)

In [25]:
# Commented out
# df.to_csv('Output.csv')
# Commented out end

df.to_csv('Output_all.csv')


# df = df.drop(df[df.Common_adjectives == set()].index)
# Removing rows where there is no common adjective in the two nouns
df = df[df.Common_adjectives != set()]
df = df[df.Common_adjectives.notnull()]
df = df.drop(df[df.Head_noun_lemmatised == df.Tail_noun_lemmatised].index)
df = df.drop_duplicates(subset = ['Head_noun_lemmatised', 'Tail_noun_lemmatised'])
# df = df.drop_duplicates(subset = "Words_score_sentenceID_tuple")
# df = df.Words_score_sentenceID_set[~(df.Words_score_sentenceID_set.astype(str).duplicated)]
# df = df.drop(df[df.Common_adjectives is None].index)
df.to_csv('Output_limited.csv')

In [26]:
df.shape

(1063, 16)

In [27]:
df.head()

Unnamed: 0,ID,A1,A2,A3,A4,A5,Score,Sentence_ID,Head,Tail,Head_tags,Tail_tags,Noun_pair,Head_noun_lemmatised,Tail_noun_lemmatised,Common_adjectives
23,838__industry_17__image_19,2,1,1,1,1,0.660718,838,industry,image,{NOUN},{NOUN},True,industry,image,{strong}
26,6391__science_2__image_6,2,1,1,0,0,0.370467,6391,science,image,{NOUN},{NOUN},True,science,image,"{soft, sweet, weak, bright, strong}"
27,4569__potential_7__role_8,3,2,1,1,0,0.775985,4569,potential,role,"{ADJ, NOUN}",{NOUN},True,potential,role,{clear}
47,6198__competition_39__stages_44,1,1,0,0,0,0.253645,6198,competition,stages,{NOUN},{NOUN},True,competition,stage,{bitter}
50,3734__potential_19__fields_21,1,1,1,1,0,0.508778,3734,potential,fields,"{ADJ, NOUN}","{VERB, NOUN}",True,potential,field,{clear}


In [28]:
triples = set()
for row in df.itertuples(index = False):
  for adj in row.Common_adjectives:
    triples.add((row.Head_noun_lemmatised, adj, row.Tail_noun_lemmatised))

In [29]:
with open("triples.txt", mode = "w") as triples_file:
  for element in triples:
    triples_file.write(element[0] + "\t" + element[1] + "\t" + element[2] + "\n")

In [30]:
# import networkx as nx

In [31]:
# dg = nx.DiGraph()
# dg = nx.from_pandas_edgelist(df, source = 'Head_noun_lemmatised', target = 'Tail_noun_lemmatised', edge_attr = 'Common_adjectives')

In [32]:
# len(dg.nodes())

In [33]:
noun_set = set()
for i in df.Head_noun_lemmatised:
  noun_set.add(i)
for i in df.Tail_noun_lemmatised:
  noun_set.add(i)
print(len(noun_set))

787


In [34]:
import pandas as pd
df_sonnets = pd.read_csv("Sonnets.txt", sep = "@", names = ["LineText"])
df_sonnets = df_sonnets[df_sonnets['LineText'].str.len() > 8]
df_sonnets['LineText'] = df_sonnets['LineText'].str.lower()
df_sonnets['LineText'] = df_sonnets['LineText'].str.replace('[^a-z]', ' ', regex = True)

In [35]:
def get_set_from_list(l):
  list_set = set()
  for i in l:
    list_set.add(i)
  return list_set

def intersection_words_string(intersection_words):
  s = ""
  if not intersection_words:
    return
  for word in intersection_words:
    s = s + word + "|"
  if s[-1] == '|':
    s = s[0:len(s) - 1]
  return s


df_sonnets['LineTextSplit'] = df_sonnets['LineText'].str.split()
df_sonnets['LineTextSet'] = df_sonnets.apply(lambda x : get_set_from_list(x.LineTextSplit), axis = 1) 
df_sonnets['Intersection_Words'] = df_sonnets.apply(lambda x : noun_set.intersection(x.LineTextSet), axis = 1)
df_sonnets['Intersection_count'] = df_sonnets.apply(lambda x : len(x.Intersection_Words), axis = 1)
df_sonnets = df_sonnets[df_sonnets['Intersection_count'] > 1]
df_sonnets['Intersection_Words'] = df_sonnets.apply(lambda x : intersection_words_string(x.Intersection_Words), axis = 1)
df_sonnets = df_sonnets.drop(columns = ['LineTextSplit', 'LineTextSet'])
df_sonnets.to_csv('Sonnet_pairs.csv')
df_sonnets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sonnets['Intersection_Words'] = df_sonnets.apply(lambda x : intersection_words_string(x.Intersection_Words), axis = 1)


Unnamed: 0,LineText,Intersection_Words,Intersection_count
1,from fairest creatures we desire increase,desire|increase,2
6,feed st thy light s flame with self substantia...,feed|light,2
9,thou that art now the world s fresh ornament,art|world,2
25,if thou couldst answer this fair child of mine,answer|child,2
29,and see thy blood warm when thou feel st i...,cold|blood|feel,3
...,...,...,...
2285,which borrow d from this holy fire of love,fire|love,2
2289,but at my mistress eye love s brand new fired,eye|love,2
2290,the boy for trial needs would touch my breast,trial|breast|boy|touch,4
2305,which from love s fire took heat perpetual,fire|love,2


In [36]:
# dg.adj["science"]

In [37]:
# nx.__version__

In [38]:
# # !pip install --upgrade scipy networkx
# ! pip install 'scipy>=1.8'

In [39]:
# # nx.draw_networkx(dg, node_size = 60, font_size = 8)
# import matplotlib.pyplot as plt
# # plt.figure(figsize = (18, 18), dpi = 500)
# plt.figure(figsize = (18, 18))
# nx.draw_networkx(dg, pos = nx.spring_layout(dg), node_size = 10, font_size = 2)
# plt.savefig("plot.png")
# nx.draw(dg)
# #plt.savefig("graph.pdf")