In [None]:
!pip install spacy-experimental
!pip install https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl#egg=en_coreference_web_trf


In [None]:
# !pip install spacy==3.4.4

In [None]:
# !pip install spacy-transformers

In [None]:
# !pip show spacy

In [None]:
import spacy
import spacy_experimental
# import spacy_transformers
import re
import pandas as pd

In [None]:
# nlp = spacy.load("en_coreference_web_trf")


In [None]:
# df = pd.read_excel(r"/content/past_one_month.xlsx")

In [None]:
# text = df.loc[0, 'full_content']

### This coref resolution technique resolves coreference pertaining to PERSON entity in a text. Token limit in text should not exceed 512 

In [76]:
class coref_resolution:
  def __init__(self,text):
    self.text = text
  
  def get_coref_clusters(self,):
    """This method produces coref clusters"""
    self.nlp = spacy.load("en_core_web_trf")
    nlp_coref = spacy.load("en_coreference_web_trf")

    # use replace_listeners for the coref components
    nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
    nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])

    # we won't copy over the span cleaner
    self.nlp.add_pipe("coref", source=nlp_coref)
    self.nlp.add_pipe("span_resolver", source=nlp_coref)

    self.doc = self.nlp(self.text)
    self.tokens = [str(token) for token in self.doc]
    coref_clusters = {key : val for key , val in self.doc.spans.items() if re.match(r"coref_clusters_*",key)}

    return coref_clusters
  
  def find_span_start_end(self,coref_clusters):
    """This method finds start and end span of entire text piece in every cluster"""
    cluster_w_spans = {}
    for cluster in coref_clusters:
      cluster_w_spans[cluster] = [(span.start, span.end, span.text) for span in coref_clusters[cluster]]
    
    return cluster_w_spans
  
  def find_person_start_end(self, coref_clusters,cluster_w_spans):
    """this function finds the start and end span of PERSON in every element of every cluster"""
    # nlp = spacy.load("en_core_web_trf")
    coref_clusters_with_name_spans = {}
    for key, val in coref_clusters.items():
      temp = [0 for i in range(len(val))]
      person_flag = False
      for idx, text in enumerate(val):
        doc = self.nlp(str(text))
        for word in doc.ents:
          if word.label_ == 'PERSON':
            temp[idx] = (word.start, word.end, word.text)
            person_flag = True
        for token in doc:
          if token.pos_ == 'PRON':
            temp[idx] = (token.i,token.i+1,token)
      if len(temp) > 0:
        if person_flag:
          orig = cluster_w_spans[key]
          for idx, tup in enumerate(orig):
            if isinstance(tup, tuple) and isinstance(temp[idx], tuple):
              orig_start, orig_end, text = tup
              offset_start, offset_end, _ = temp[idx]
              orig_start += offset_start
              orig_end = orig_start + (offset_end - offset_start) 
              orig[idx] = (orig_start, orig_end, text)
          coref_clusters_with_name_spans[key] = orig

    return coref_clusters_with_name_spans
  
  def replace_refs_w_names(self,coref_clusters_with_name_spans):
    """This function replaces name references and pronouns by actual name of the person"""
    tokens = self.tokens
    special_tokens = ["my","his","her","mine"]
    for key, val in coref_clusters_with_name_spans.items():
      if len(val) > 0 and isinstance(val, list):
        head = val[0]
        head_start, head_end, _ = head
        head_name = " ".join(tokens[head_start:head_end])
        for i in range(1,len(val)):
          coref_token_start, coref_token_end, _ = val[i]
          count = 0
          for j in range(coref_token_start, coref_token_end):
            if tokens[j].upper() == "I":
                count += 1
                continue
            if count == 0:
              if tokens[j].lower() in special_tokens:
                if head_name[-1].lower() == "s":
                  tokens[j] = str(head_name)+"'"
                else:
                  tokens[j] = str(head_name)+"'s"
              else:
                tokens[j] = head_name
            else:
              tokens[j] = ""
            count += 1

    return tokens
  
  def main(self,):
    """combines all the steps and returns the coreferenced text"""
    coref_clusters = self.get_coref_clusters()
    coref_w_spans = self.find_span_start_end(coref_clusters)
    coref_clusters_with_name_spans = self.find_person_start_end(coref_clusters,coref_w_spans)
    tokens = self.replace_refs_w_names(coref_clusters_with_name_spans)

    return " ".join(tokens)




  

  

In [None]:
del obj

In [77]:
text = """As Elizabeth Holmes prepares to report to prison next week, the criminal case that laid bare the blood-testing scam at the heart of her Theranos startup is entering its final phase. The 11-year sentence represents a comeuppance for the wide-eyed woman who broke through "tech bro" culture to become one of Silicon Valley 's most celebrated entrepreneurs, only to be exposed as a fraud. Along the way, Holmes became a symbol of the shameless hyperbole that often saturates startup culture. But questions still linger about her true intentions - so many that even the federal judge who presided over her trial seemed mystified. And Holmes' defenders continue to ask whether the punishment fits the crime. At 39, she seems most likely to be remembered as Silicon Valley's Icarus - a high-flying entrepreneur burning with reckless ambition whose odyssey culminated in convictions for fraud and conspiracy. Her motives are still somewhat mysterious, and some supporters say federal prosecutors targeted her unfairly in their zeal to bring down one of the most prominent practitioners of fake-it-til-you-make-it - the tech sector's brand of self-promotion that sometimes veers into exaggeration and blatant lies to raise money. Holmes will begin to pay the price for her deceit on May 30 when she is scheduled to begin the sentence that will separate her from her two children - a son whose July 2021 birth delayed the start of her trial and a 3-month-old daughter conceived after her conviction. Discover the stories of your interest Blockchain 5 Stories Cyber-safety 7 Stories Fintech 9 Stories E-comm 9 Stories ML 8 Stories Edtech 6 Stories She is expected to be incarcerated in Bryan, Texas, about 100 miles (160 km) northwest of her hometown of Houston. The prison was recommended by the judge who sentenced Holmes, but authorities have not publicly disclosed where she will be held. Her many detractors contend she deserves to be in prison for peddling a technology that she repeatedly boasted would quickly scan for hundreds of diseases and other health problems with a few drops of blood taken with a finger prick. The technology never worked as promised. Instead, Theranos tests produced wildly unreliable results that could have endangered patients' lives - one of the most frequently cited reasons why she deserved to be prosecuted. Before those lies were uncovered in a series of explosive articles in The Wall Street Journal beginning in October 2015, Holmes raised nearly $1 billion from a list of savvy investors including Oracle co-founder Larry Ellison and media mogul Rupert Murdoch. It was the duping of those investors that led to her prison sentence and a $452 million restitution bill. Holmes' stake in Theranos at one point catapulted her paper wealth to $4.5 billion. She never sold any of her stock in the company, though trial evidence left no doubt she reveled in the trappings of fame and fortune - so much so that she and the father of her children, William "Billy" Evans, lived on a palatial Silicon Valley estate during the trial. The theory that Holmes was running an elaborate scam was buttressed by trial evidence documenting her efforts to prevent the Journal's investigation from being published. That campaign compelled John Carreyrou - the reporter responsible for those bombshell stories - to attend court and position himself in Holmes' line of vision when she took the witness stand. Holmes also signed off on surveillance aimed at intimidating Theranos employees who helped uncover the flaws with the blood-testing technology. The whistleblowers included Tyler Shultz, the grandson of former Secretary of State George Shultz, whom Holmes befriended and persuaded to join the Theranos board. Tyler Shultz became so unnerved by Holmes' efforts to shut him up that he began sleeping with a knife under his pillow, according to a wrenching statement delivered by his father, Alex, at her sentencing. Holmes' supporters still contend she always had good intentions and was unfairly scapegoated by the Justice Department. They insist she simply deployed the same over-the-top promotion tactics as many other tech executives, including Elon Musk, who has repeatedly made misleading statements about the capabilities of Tesla's self-driving cars. According to those supporters, Holmes was singled out because she was a woman who briefly eclipsed the men who customarily bask in Silicon Valley's spotlight, and the trial turned her into a latter-day version of Hester Prynne - the protagonist in the 1850 novel "The Scarlet Letter." Holmes steadfastly maintained her innocence during seven often-riveting days of testimony in her own defense - a spectacle that caused people to line up shortly after midnight to secure one of the few dozen seats available in the San Jose courtroom. On one memorable day, Holmes recounted how she had never gotten over the trauma of being raped while enrolled at Stanford University. She then described being subjected to a long-running pattern of emotional and sexual abuse by her former lover and Theranos conspirator, Ramesh "Sunny" Balwani, and suggested his stifling control blurred her thinking. Balwani's lawyer, Jeffrey Coopersmith, denied those allegations during the trial. In Balwani's subsequent trial, Coopersmith unsuccessfully tried to depict his client as Holmes' pawn. Balwani, 57, is now serving a nearly 13-year prison sentence for fraud and conspiracy. When it came time to sentence the then-pregnant Holmes in November, US District Judge Edward Davila seemed as puzzled as anyone about why she did what she did. "This is a fraud case where an exciting venture went forward with great expectations and hope, only to be dashed by untruth, misrepresentations, hubris and plain lies," Davila lamented while Holmes stood before him. "I suppose we step back and we look at this, and we think what is the pathology of fraud?" The judge also hearkened back to the days that Silicon Valley consisted mostly of orchards farmed by immigrants. That was before the land was ceded to the tech boom beginning in 1939 when William Hewlett and David Packard founded a company bearing their surnames in a one-car garage in Palo Alto - the same city where Theranos was based. "You'll recall the wonderful innovation of those two individuals in that small garage," Davila reminded everyone in the rapt courtroom. "No exotic automobiles or lavish lifestyle, just a desire to create for society's"""

In [78]:
obj = coref_resolution(text)
refined = obj.main()

In [79]:
refined

'As Elizabeth Holmes prepares to report to prison next week , the criminal case that laid bare the blood - testing scam at the heart of Elizabeth Holmes\' Theranos startup is entering its final phase . The 11 - year sentence represents a comeuppance for the wide - eyed woman Elizabeth Holmes broke through " tech bro " culture to become one of Silicon Valley \'s most celebrated entrepreneurs , only to be exposed as a fraud . Along the way , Elizabeth Holmes became a symbol of the shameless hyperbole that often saturates startup culture . But questions still linger about Elizabeth Holmes\' true intentions - so many that even the federal judge who presided over Elizabeth Holmes\' trial seemed mystified . And Elizabeth Holmes \' defenders continue to ask whether the punishment fits the crime . At 39 , Elizabeth Holmes seems most likely to be remembered as Silicon Valley \'s Icarus - a high - flying entrepreneur burning with reckless ambition whose odyssey culminated in convictions for frau

In [None]:
df.shape

(491, 12)

In [None]:
from tqdm import tqdm
for idx, row in tqdm(df.iterrows()):
  obj = coref_resolution(row['full_content'])
  df.loc[idx, 'coref_resolved_content'] = obj.main()
  del obj 

15it [05:45, 24.42s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (649 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (649 > 512). Running this sequence through the model will result in indexing errors
81it [27:06, 21.02s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (655 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (655 > 512). Running this sequence through the model will result in indexing errors
455it [2:39:20, 16.23s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors
Token indices sequ