"""
One of the most common tasks in NLP is predicting named entities, like people, locations, countries, brands, etc.

Performing NER is ridiculously easy in spaCy. After processing a text, just extract the ents attribute of the doc object:
"""

In [1]:
#pip install spacy
import numpy as np

In [2]:
#!python -m spacy download en_core_web_lg 

In [3]:
import spacy
import pandas as pd

print()




In [4]:
nlp = spacy.load("en_core_web_lg")
sentences_columns = ['sentence_id', 'sentence']
named_ent_columns= ['sentence_id', 'Text', 'Label'] 


In [5]:
sentences_table = pd.DataFrame(columns = ['sentence_id', 'sentece'])
named_entity_tb = pd.DataFrame(columns = ['sentence_id', 'Text', 'Label'] )

In [6]:
#load book
with open('of_mice_new.txt', 'r', encoding='utf-8') as f:
     txt = f.read()

In [7]:
#conver to doc
doc = nlp(txt)

In [8]:
#     for token in sent:
#         print(token," : ",token.pos_)

In [9]:
id = 0;
list_senteces = []
for sent in doc.sents:
    list_senteces.append(sent)
    for e in sent.ents:
          new_row  = {named_ent_columns[0] : id, named_ent_columns[1] : e.text,  named_ent_columns[2] : e.label_ }
          named_entity_tb = named_entity_tb.append( new_row, ignore_index=True)   
         
    id+=1    

In [10]:
named_entity_tb

Unnamed: 0,sentence_id,Text,Label
0,0,1937,DATE
1,1,John Steinbeck,PERSON
2,2,Compass Books Edition,ORG
3,2,1963,DATE
4,2,"The Viking Press, Inc.",ORG
...,...,...,...
1419,3739,George,PERSON
1420,3740,Curley,PERSON
1421,3740,Carlson,PERSON
1422,3741,Carlson,PERSON


In [58]:
labels = named_entity_tb['Label'].unique()
list_senteces[3741]

And Carlson said, “Now what the
hell ya suppose is eatin’ them two guys?”



                                THE END

In [12]:
for ent in labels :
    print(ent,":",spacy.explain(ent)) 

DATE : Absolute or relative dates or periods
PERSON : People, including fictional
ORG : Companies, agencies, institutions, etc.
CARDINAL : Numerals that do not fall under another type
FAC : Buildings, airports, highways, bridges, etc.
GPE : Countries, cities, states
QUANTITY : Measurements, as of weight or distance
LOC : Non-GPE locations, mountain ranges, bodies of water
ORDINAL : "first", "second", etc.
TIME : Times smaller than a day
WORK_OF_ART : Titles of books, songs, etc.
MONEY : Monetary values, including unit
NORP : Nationalities or religious or political groups
PRODUCT : Objects, vehicles, foods, etc. (not services)


FILTER BY PERSON

In [13]:
named_entity_tb

Unnamed: 0,sentence_id,Text,Label
0,0,1937,DATE
1,1,John Steinbeck,PERSON
2,2,Compass Books Edition,ORG
3,2,1963,DATE
4,2,"The Viking Press, Inc.",ORG
...,...,...,...
1419,3739,George,PERSON
1420,3740,Curley,PERSON
1421,3740,Carlson,PERSON
1422,3741,Carlson,PERSON


In [14]:
persons_entity = named_entity_tb[named_entity_tb['Label']=='PERSON']


In [15]:
counts_appeared = persons_entity['Text'].value_counts()

In [16]:
counts_appeared

George           363
Lennie           311
Curley            96
Carlson           46
Slim              44
                ... 
S’pose Curley      1
Goodnight          1
George coldly      1
Soledad            1
Andy               1
Name: Text, Length: 66, dtype: int64

In [17]:
len(counts_appeared)

66

In [18]:
counts_appeared.index

Index(['George', 'Lennie', 'Curley', 'Carlson', 'Slim', 'Whit', 'Candy', 'Jus',
       'Susy', 'Clara', 'Bill', 'Nobody’d', 'Aunt Clara', 'Tha', 'Murray',
       'Le', 'Buck', 'Awright', 'Purty', 'George Milton', 'Gabilan',
       'Lennie Small', 'Guy', 'Janson', 'O.K.', 'Milton', 'Carl', 'li’ble',
       'outa\nWeed', 'jack jus', 'Howard', 'jack', 'Lulu', 'S’pose George',
       'George wun’t', 'awmighty', 'gon’ta', 'George—George—George',
       'Peter Rand', 'Wha', 'Bust', 'little euchre', 'Bill Tenner', 'Sta',
       'Al Wilts', 'William Tenner', 'God awmighty', 'Jackson', 'Take Curley',
       'fren', 'Whitey', 'Andy Cushman', 'jack-pin', 's’pose George', 'Coulda',
       'Noiselessly Lennie', 'sta', 'George knelt', 'Baloney',
       'John Steinbeck', 'George unslung', 'S’pose Curley', 'Goodnight',
       'George coldly', 'Soledad', 'Andy'],
      dtype='object')

In [19]:
for person in counts_appeared.index:
    a = persons_entity[persons_entity['Text'] == person]
    print(a['sentence_id'].value_counts())

73      2
547     2
245     2
1934    2
752     2
       ..
679     1
2598    1
2158    1
1699    1
3710    1
Name: sentence_id, Length: 358, dtype: int64
856     2
868     2
2904    2
1003    2
1041    2
       ..
164     1
3515    1
2719    1
670     1
397     1
Name: sentence_id, Length: 306, dtype: int64
2960    2
3393    2
2737    1
3423    1
861     1
       ..
1831    1
1830    1
2213    1
3343    1
895     1
Name: sentence_id, Length: 94, dtype: int64
1600    1
1200    1
1825    1
3637    1
1637    1
1510    1
1831    1
2121    1
3417    1
3419    1
2223    1
1205    1
1219    1
3741    1
2125    1
1653    1
1520    1
3740    1
2107    1
3355    1
2195    1
3390    1
1823    1
1182    1
1821    1
2187    1
1476    1
3397    1
3335    1
1225    1
2123    1
1867    1
1485    1
1870    1
1616    1
1535    1
1502    1
3717    1
2197    1
1603    1
3415    1
1830    1
1177    1
1626    1
923     1
1471    1
Name: sentence_id, dtype: int64
1522    2
1642    2
1295    1
1058    1
2545

In [20]:
for person in counts_appeared.index:
    persons_entity.loc[persons_entity['Text'] == person, 'number_matches_in_text'] = counts_appeared[person]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [21]:
persons_entity

Unnamed: 0,sentence_id,Text,Label,number_matches_in_text
1,1,John Steinbeck,PERSON,1.0
12,2,Janson,PERSON,1.0
28,32,Lennie,PERSON,311.0
29,33,Lennie,PERSON,311.0
30,35,Lennie,PERSON,311.0
...,...,...,...,...
1418,3736,George,PERSON,363.0
1419,3739,George,PERSON,363.0
1420,3740,Curley,PERSON,96.0
1421,3740,Carlson,PERSON,46.0


In [22]:
persons_entity

Unnamed: 0,sentence_id,Text,Label,number_matches_in_text
1,1,John Steinbeck,PERSON,1.0
12,2,Janson,PERSON,1.0
28,32,Lennie,PERSON,311.0
29,33,Lennie,PERSON,311.0
30,35,Lennie,PERSON,311.0
...,...,...,...,...
1418,3736,George,PERSON,363.0
1419,3739,George,PERSON,363.0
1420,3740,Curley,PERSON,96.0
1421,3740,Carlson,PERSON,46.0


In [56]:
k = persons_entity.groupby('sentence_id').count()
sentences_with_many_persons = k[k['Text'] > 2].index
sentences_with_many_persons

Int64Index([547, 1112, 1174, 1830, 1870, 2107, 3335], dtype='int64', name='sentence_id')

In [24]:

k

Unnamed: 0_level_0,Text,Label,number_matches_in_text
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,1,1
2,1,1,1
32,1,1,1
33,1,1,1
35,1,1,1
...,...,...,...
3730,1,1,1
3736,2,2,2
3739,1,1,1
3740,2,2,2


In [32]:
k = persons_entity[persons_entity['sentence_id'] == 3736]

   

    

In [42]:
persons_entity[persons_entity['sentence_id'] == 100]

Unnamed: 0,sentence_id,Text,Label,number_matches_in_text,combination_with_other_caracters
59,100,Murray,PERSON,3.0,
60,100,George,PERSON,363.0,Slim


In [52]:
for sentence_id in sentences_with_many_persons:
    k = persons_entity[persons_entity['sentence_id'] == sentence_id]
    for i in k.Text:
        combination_with_other_caracters = list(k['Text'])
        combination_with_other_caracters.remove(i)
        d = persons_entity.loc[(persons_entity['sentence_id']== sentence_id)
                               & (persons_entity['Text']==i),['combination_with_other_caracters']] = str(combination_with_other_caracters)
    


In [53]:
persons_entity

Unnamed: 0,sentence_id,Text,Label,number_matches_in_text,combination_with_other_caracters
1,1,John Steinbeck,PERSON,1.0,
12,2,Janson,PERSON,1.0,
28,32,Lennie,PERSON,311.0,
29,33,Lennie,PERSON,311.0,
30,35,Lennie,PERSON,311.0,
...,...,...,...,...,...
1418,3736,George,PERSON,363.0,['Slim']
1419,3739,George,PERSON,363.0,
1420,3740,Curley,PERSON,96.0,['Carlson']
1421,3740,Carlson,PERSON,46.0,['Curley']


In [57]:
persons_entity[persons_entity['sentence_id'] == 3335]

Unnamed: 0,sentence_id,Text,Label,number_matches_in_text,combination_with_other_caracters
1249,3335,Carlson,PERSON,46.0,"['Whit', 'Curley']"
1250,3335,Whit,PERSON,26.0,"['Carlson', 'Curley']"
1251,3335,Curley,PERSON,96.0,"['Carlson', 'Whit']"


In [59]:
counts_appeared

George           363
Lennie           311
Curley            96
Carlson           46
Slim              44
                ... 
S’pose Curley      1
Goodnight          1
George coldly      1
Soledad            1
Andy               1
Name: Text, Length: 66, dtype: int64

In [64]:
len(counts_appeared)
mean = counts_appeared.mean()
std = counts_appeared.std()

mean + std

74.35751401873966

In [81]:
def calculate_rank(x,mean,std):
    if x >= mean + std :
        return 1
    elif x >= mean:
        return 2
    else:
        return 3

In [82]:
persons_entity['rank'] = persons_entity['number_matches_in_text'].apply(lambda x : calculate_rank(x,mean= counts_appeared.mean(),std=counts_appeared.std() ))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  persons_entity['rank'] = persons_entity['number_matches_in_text'].apply(lambda x : calculate_rank(x,mean= counts_appeared.mean(),std=counts_appeared.std() ))


In [89]:
persons_entity[persons_entity['rank']==1]['Text'].unique()

array(['Lennie', 'George', 'Curley'], dtype=object)

referecnes:
    https://home.aveek.io/blog/post/finding-main-characters/