# Frequency analysis and sentiment with NLTK

find_word_freq_readfiles.ipynb

### 1 ###
•	It reads the dataframe with the analysis (file_df_analysis) and the dataframe with the histogram (file_df_analysis_hist) information created by “analyse_papers.ipynb”
•	Frequency analysis of the sentence were a paper was cited. It finds a representative sentence showing how the a paper has been cited in the different sections. It is selected by getting the list of sentences containing the maximum number of most frequent words used when citing the paper (in each section). If there are several sentences it picks the shortest one.

### 2 ###
•	Sentiment analysis using NLTK.



##############################################################


### 1 ###
•	It reads the dataframe with the analysis (file_df_analysis) and the dataframe with the histogram (file_df_analysis_hist) information created by “analyse_papers.ipynb”
•	Frequency analysis of the sentence were a paper was cited. It finds a representative sentence showing how the a paper has been cited in the different sections. It is selected by getting the list of sentences containing the maximum number of most frequent words used when citing the paper (in each section). If there are several sentences it picks the shortest one.


In [1]:
import sys
import os
import re
from copy import deepcopy
import numpy as np
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

sys.path.insert(0, "./modules")
import words_frec_analysis_get_sentence



# Configuration

In [2]:
data_path = 'data'

ds_name = 'DOI_cited_science_1179052_retracted'

analysis_path = os.path.join(data_path, 'analysis')

# In
citing_sections_tsv = os.path.join(analysis_path, '%s_sections.tsv' % ds_name)
analysis_hist_tsv = os.path.join(analysis_path, '%s_hist_v2.tsv' % ds_name)

# Setup

In [3]:
# One time download
nltk.download("stopwords")
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /home/deuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/deuser/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
df = pd.read_csv(citing_sections_tsv, sep='\t', encoding='utf-8')
df.head()

Unnamed: 0,cited_DOI,cited_in_conclusions,cited_in_discussion,cited_in_introduction,cited_in_maintext,citing_DOI,conclusions_found,discussion_found,introduction_found,maintext_found,reference_number,sentence_citing_conclusions,sentence_citing_discussion,sentence_citing_intro,sentence_citing_maintext
0,10.1126/science.1179052,False,False,False,False,DOI not found,False,False,False,True,B2,,,,
1,10.1126/science.1179052,False,False,True,True,10.1371/journal.pone.0027870,False,True,True,True,pone.0027870-Lombardi1,,,"['In 2009, Lombardi<italic>et al.</italic>repo...",['For amplification of XMRV/MLV<italic>gag</it...


In [5]:
df_hist = pd.read_csv(analysis_hist_tsv, sep='\t', encoding='utf-8')
df_hist.head()

Unnamed: 0,DOI_cited,cited_in_conlusions,cited_in_discussion,cited_in_introduction,cited_in_maintext,papers_all_sections_found,total_papers_citing
0,10.1126/science.1179052,0,0,1.0,1.0,1,2


In [6]:
df[df['cited_DOI'] == df_hist.iloc[0].DOI_cited]

Unnamed: 0,cited_DOI,cited_in_conclusions,cited_in_discussion,cited_in_introduction,cited_in_maintext,citing_DOI,conclusions_found,discussion_found,introduction_found,maintext_found,reference_number,sentence_citing_conclusions,sentence_citing_discussion,sentence_citing_intro,sentence_citing_maintext
0,10.1126/science.1179052,False,False,False,False,DOI not found,False,False,False,True,B2,,,,
1,10.1126/science.1179052,False,False,True,True,10.1371/journal.pone.0027870,False,True,True,True,pone.0027870-Lombardi1,,,"['In 2009, Lombardi<italic>et al.</italic>repo...",['For amplification of XMRV/MLV<italic>gag</it...


In [7]:
df_filtered = df['sentence_citing_maintext'].dropna()

df_filtered

1    ['For amplification of XMRV/MLV<italic>gag</it...
Name: sentence_citing_maintext, dtype: object

In [8]:
was_intro_found = df['introduction_found'] == True
df_filtered = df[was_intro_found]
print(df_filtered.shape)
df_filtered

(1, 15)


Unnamed: 0,cited_DOI,cited_in_conclusions,cited_in_discussion,cited_in_introduction,cited_in_maintext,citing_DOI,conclusions_found,discussion_found,introduction_found,maintext_found,reference_number,sentence_citing_conclusions,sentence_citing_discussion,sentence_citing_intro,sentence_citing_maintext
1,10.1126/science.1179052,False,False,True,True,10.1371/journal.pone.0027870,False,True,True,True,pone.0027870-Lombardi1,,,"['In 2009, Lombardi<italic>et al.</italic>repo...",['For amplification of XMRV/MLV<italic>gag</it...


In [9]:
df_filtered.drop('sentence_citing_intro', axis=1, inplace=False).describe()

Unnamed: 0,sentence_citing_conclusions,sentence_citing_discussion
count,0.0,0.0
mean,,
std,,
min,,
25%,,
50%,,
75%,,
max,,


#### Let's find the most frequente words in an example of a representative sentence for each section (intro, discussion, etc) where the paper is cited.

In [10]:
df = pd.read_csv(citing_sections_tsv, sep='\t', encoding='utf-8')
#  df['cited_DOI', 'cited_in_conclusions','cited_in_discussion',
#    'cited_in_introduction', 'cited_in_maintext', 'citing_DOI',
#    'conclusions_found', 'discussion_found', 'introduction_found',
#    'maintext_found', 'reference_id', 'sentence_citing_conclusions',
#    'sentence_citing_discussion', 'sentence_citing_intro', 'sentence_citing_maintext']


print('cited_DOI: ', df.cited_DOI.unique())


print("\n------------- ANALYSIS INTRODUCTION -----------")
sentences_intro = df.sentence_citing_intro.dropna()
if not sentences_intro.empty:
    words_frec_analysis_get_sentence.analysis(sentences_intro)
else:
    print("No sentences to analyse")

print("\n------------- ANALYSIS MAINTEXT -----------")
sentences_maintext = df.sentence_citing_maintext.dropna()
if not sentences_maintext.empty:
    words_frec_analysis_get_sentence.analysis(sentences_maintext)
else:
    print("No sentences to analyse")

print("\n------------- ANALYSIS DISCUSSION -----------")
sentences_discussion = df.sentence_citing_discussion.dropna()
if not sentences_discussion.empty:
    words_frec_analysis_get_sentence.analysis(sentences_discussion)
else:
    print("No sentences to analyse")
    
    
print("\n ------------- ANALYSIS CONCLUSIONS -----------")
#number_features = 5
sentences_conclusions = df.sentence_citing_conclusions.dropna()
if not sentences_conclusions.empty:
    words_frec_analysis_get_sentence.analysis(sentences_conclusions)
else:
    print("No sentences to analyse")    


cited_DOI:  ['10.1126/science.1179052']

------------- ANALYSIS INTRODUCTION -----------
1    reported dection XMRV peripherblood mononuclea...
Name: sentence_citing_intro, dtype: object
['cells', 'cfs', 'cohort', 'compared', 'controls']
Frequent words included:  [5]
Maximum words in the same sentence  =  5
1,"['In 2009, Lombardi<italic>et al.</italic>reported the detection of XMRV in both peripheral blood mononuclear cells (PBMC) and plasma of 67% of a CFS patient cohort compared to 3.7% in healthy controls<xref ref-type=""bibr"" rid=""pone.0027870-Lombardi1"">[5]</xref>.']"

------------- ANALYSIS MAINTEXT -----------
1    gagsequences, µl transcribed cDNA used first r...
Name: sentence_citing_maintext, dtype: object
['amplification', 'round', 'transcribed', 'usb', 'used']
Frequent words included:  [5]
Maximum words in the same sentence  =  5
1,"['For amplification of XMRV/MLV<italic>gag</italic>sequences, 5 µl of the transcribed cDNA were used for the first round of amplification wi

### 2 ###
•	Sentiment analysis using NLTK.

In [12]:
stopwords = nltk.corpus.stopwords

def filter_sentence(citing_sentence):
    if citing_sentence == None:
        return " " #filtered_sentences_noNone.append(" ")
    
    if citing_sentence != None:
        citing_sentence = re.sub("[\<\[].*?[\>\]]", "", citing_sentence) #to remove citations
        #citing_sentence = re.sub("[*?]", "", citing_sentence) #to remove citations
        citing_sentence = re.sub('[0-9]+', '', citing_sentence)
        delete = ["Introduction", "Background", "Conclusions","the", "and", "therefore", "thus"]#, "\n", "<\sub>", "bibr", "ref", "rid", "type", "xref"] #, "/p\np\n", "\p"]
        for word in delete:
            citing_sentence = re.sub(word, "", citing_sentence) 
        #citing_sentence = re.sub("\?", "", citing_sentence) #to remove citations
        citing_sentence = ' '.join([word for word in citing_sentence.split() if word not in (stopwords.words('english'))])
        return citing_sentence # O ?????????filtered_sentences_noNone.append(sentence[0])

def fit_transform_or_empty(vectorizer, data):
    return count_vectorizer.fit_transform(data) if len(data) > 0 else csr_matrix([])

#file_df = '/project/elife/data/analysis/df_1000_1000v2_prep_.csv'
df = pd.read_csv(citing_sections_tsv, sep='\t', encoding='utf-8')

#  df['cited_DOI', 'cited_in_conclusions','cited_in_discussion',
#    'cited_in_introduction', 'cited_in_maintext', 'citing_DOI',
#    'conclusions_found', 'discussion_found', 'introduction_found',
#    'maintext_found', 'reference_id', 'sentence_citing_conclusions',
#    'sentence_citing_discussion', 'sentence_citing_intro', 'sentence_citing_maintext']
    


count_vectorizer = CountVectorizer(max_features=5)
#sentence = ''.join(str(string) for string in citing_sentences_list)
#sentence = sentence.decode('utf-8')
#u_sentence = unicode( sentence, "utf-8" )
#backToBytes = u_sentence.encode( "utf-8" )



#sentence = re.sub(r',([0-9])', '\\1', sentence)
# sort out HMTL formatting of &
#sentence = re.sub(r'&amp', 'and', sentence)

citing_sentences_intro_list = df.sentence_citing_intro.dropna()
citing_sentences_maintext_list = df.sentence_citing_maintext.dropna()
citing_sentences_discussion_list = df.sentence_citing_discussion.dropna()
citing_sentences_conclusions_list = df.sentence_citing_conclusions.dropna()
    
citing_sentences_original_intro = deepcopy(citing_sentences_intro_list)
citing_sentences_original_maintext = deepcopy(citing_sentences_maintext_list)
citing_sentences_original_discussion = deepcopy(citing_sentences_discussion_list)
citing_sentences_original_conclusions = deepcopy(citing_sentences_conclusions_list) # This is a list of lists, so you need deepcopy

filtered_sentences_intro_list = citing_sentences_intro_list.apply(filter_sentence)
filtered_sentences_maintext_list = citing_sentences_maintext_list.apply(filter_sentence)
filtered_sentences_discussion_list = citing_sentences_discussion_list.apply(filter_sentence)
filtered_sentences_conclusions_list = citing_sentences_conclusions_list.apply(filter_sentence)        

#filtered_sentences_intro_list = [filter_sentence(sentence) for sentence in citing_sentences_intro_list]
#filtered_sentences_maintext_list = [filter_sentence(sentence) for sentence in citing_sentences_maintext_list]
#filtered_sentences_discussion_list = [filter_sentence(sentence) for sentence in citing_sentences_discussion_list]
#filtered_sentences_conclusions_list = [filter_sentence(sentence) for sentence in citing_sentences_conclusions_list]

print(filtered_sentences_intro_list)
#citing_sentences_intro_list.apply(filter_sentence)

count_vectors_intro = fit_transform_or_empty(count_vectorizer, filtered_sentences_intro_list)
#print(count_vectorizer.get_feature_names())
#word_frequency_intro = count_vectors_intro.toarray()

count_vectors_maintext = fit_transform_or_empty(count_vectorizer, filtered_sentences_maintext_list)
#print(count_vectorizer.get_feature_names())
#word_frequency_maintext = count_vectors_maintext.toarray()

count_vectors_discussion = fit_transform_or_empty(count_vectorizer, filtered_sentences_discussion_list)
#print(count_vectorizer.get_feature_names())
#word_frequency_discussion = count_vectors_discussion.toarray()

count_vectors_conclusions = fit_transform_or_empty(count_vectorizer, filtered_sentences_conclusions_list)
#print(count_vectorizer.get_feature_names())
#word_frequency_conclusions = count_vectors_conclusions.toarray()


#print(word_frequency_intro)
#print(word_frequency_maintext)
#print(word_frequency_discussion)
#print(word_frequency_conclusions)

number_words_in_sentence_intro = np.sum(count_vectors_intro.toarray(),axis=1).tolist()
number_words_in_sentence_maintext = np.sum(count_vectors_maintext.toarray(),axis=1).tolist()
number_words_in_sentence_discussion = np.sum(count_vectors_discussion.toarray(),axis=1).tolist()
number_words_in_sentence_conclusions = np.sum(count_vectors_conclusions.toarray(),axis=1).tolist()



print("Frequent words included (intro): ", number_words_in_sentence_intro)
print("Frequent words included (maintext): ", number_words_in_sentence_maintext)
print("Frequent words included (discussion): ", number_words_in_sentence_discussion)
print("Frequent words included (conclusions): ", number_words_in_sentence_conclusions)

print("maximum words in the same sentence in introduction =", max(number_words_in_sentence_intro))
print("maximum words in the same sentence in maintext =", max(number_words_in_sentence_maintext))
print("maximum words in the same sentence in discussion =", max(number_words_in_sentence_discussion))
print("maximum words in the same sentence in conclusions =", max(number_words_in_sentence_conclusions))

"""
df_sentences_intro = pd.DataFrame({"citing_intro_sentence" : filtered_sentences_intro_list, 
                                 "number_frequent_words" : number_words_in_sentence_intro})
"""
# length of sentence

sentence_length_intro = []
for sentence in citing_sentences_intro_list:
    if sentence == None:
        sentence_length_intro.append(0)
    if sentence != None:
        sentence_length_intro.append(len(sentence))

df_sentences_intro = pd.DataFrame({"frequent_words" : number_words_in_sentence_intro,
                                "citing_sentence" : citing_sentences_original_intro,
                                "sentence_length": sentence_length_intro})


sentences_toCheck = df_sentences_intro[(df_sentences_intro["frequent_words"] == max(df_sentences_intro["frequent_words"]))] #['sentence_citing_intro']
min_length= min(sentences_toCheck["sentence_length"][:])
print("min_length: ", min_length)

sentence = sentences_toCheck[sentences_toCheck['sentence_length'] == min_length]["citing_sentence"]

# To print the full content
sentence.to_csv(sys.stdout)

1    et al.reported detection XMRV peripheral blood...
Name: sentence_citing_intro, dtype: object
Frequent words included (intro):  [5]
Frequent words included (maintext):  [5]
Frequent words included (discussion):  [0.0]
Frequent words included (conclusions):  [0.0]
maximum words in the same sentence in introduction = 5
maximum words in the same sentence in maintext = 5
maximum words in the same sentence in discussion = 0.0
maximum words in the same sentence in conclusions = 0.0
min_length:  265
1,"['In 2009, Lombardi<italic>et al.</italic>reported the detection of XMRV in both peripheral blood mononuclear cells (PBMC) and plasma of 67% of a CFS patient cohort compared to 3.7% in healthy controls<xref ref-type=""bibr"" rid=""pone.0027870-Lombardi1"">[5]</xref>.']"


In [13]:
df = pd.read_csv(citing_sections_tsv, sep='\t', encoding='utf-8')

citing_sentences_intro= df['sentence_citing_maintext']
citing_sentences_intro_list = citing_sentences_intro.tolist()
print(citing_sentences_intro_list[1])

count_vectorizer = CountVectorizer(max_features=5)
#sentence = ''.join(str(string) for string in citing_sentences_list)
#sentence = sentence.decode('utf-8')
#u_sentence = unicode( sentence, "utf-8" )
#backToBytes = u_sentence.encode( "utf-8" )



#sentence = re.sub(r',([0-9])', '\\1', sentence)
# sort out HMTL formatting of &
#sentence = re.sub(r'&amp', 'and', sentence)

# If you want to avoid to go through all the papers, you can select those with introductions by
# by replacing citing_sentences_intro_list by df_intro_filtered.tolist()


citing_sentences_original = deepcopy(citing_sentences_intro_list) # This is a list of lists, so you need deepcopy
#print(citing_sentences_original[19])
filtered_sentences_noNone = []
filtered_sentences = citing_sentences_intro_list[:]
print(filtered_sentences[0:1])
for sentence in filtered_sentences:
    if (type(sentence) == float):
        filtered_sentences_noNone.append(" ")
    if (type(sentence) != float):
        #print("######################### ", type(sentence), sentence)
        delete = ["Introduction", "Background", "the", "and"]
        for word in delete:
            sentence = re.sub(word, "", sentence) 
        sentence = re.sub("[\<\[].*?[\>\]]", "", sentence) #to remove cititations
        sentence = ' '.join([word for word in sentence.split() if word not in (stopwords.words('english'))])
        filtered_sentences_noNone.append(sentence)

count_vectorizer.fit_transform(filtered_sentences_noNone) #backToBytes
#print(citing_sentences_original[19]) 
    
print(count_vectorizer.get_feature_names())
count_vectors = count_vectorizer.transform(filtered_sentences_noNone)

word_frequency = count_vectors.toarray()
print(word_frequency)
number_words_in_sentence = np.sum(count_vectors.toarray(),axis=1).tolist()

print("Frequent words included: ", number_words_in_sentence)


#print(citing_sentences_intro_list)
print("maximum words in the same sentence =", max(number_words_in_sentence))

#df_introCiting = pd.DataFrame({"citing_intro_sentence" : filtered_sentences_noNone, 
#                                "number_frequent_words" : number_words_in_sentence})
#print(df_introCiting)

# length of sentence
sentence_length = []
for sentence in citing_sentences_intro_list:
    if (sentence == None) | (sentence == np.nan):
        sentence_length.append(0)
    if (sentence != None) & (sentence != np.nan):
        sentence_length.append(len(str(sentence)))



df_introCiting = pd.DataFrame({"frequent_words" : number_words_in_sentence,
                                "citing_sentence" : citing_sentences_original,
                                "sentence_length": sentence_length})


sentences_toCheck = df_introCiting[(df_introCiting["frequent_words"] == max(df_introCiting["frequent_words"]))] #['sentence_citing_intro']
min_length= min(sentences_toCheck["sentence_length"][:])

sentence = sentences_toCheck[sentences_toCheck['sentence_length'] == min_length]["citing_sentence"]

# To print the full content
sentence.to_csv(sys.stdout)

['For amplification of XMRV/MLV<italic>gag</italic>sequences, 5 µl of the transcribed cDNA were used for the first round of amplification with primers 419F (<named-content content-type="gene">5′-ATCAGTTAACCTACCCGAGTCGGAC-3′</named-content>) and 1154R (<named-content content-type="gene">5′-GCCGCCTCTTCTTCATTGTTCTC-3′</named-content>)<xref ref-type="bibr" rid="pone.0027870-Lombardi1">[5]</xref>and HotStart-IT FideliTaq Master Mix (USB) with the recommended component volumes.']
[nan]
['1154r', 'round', 'transcribed', 'usb', 'used']
[[0 0 0 0 0]
 [1 1 1 1 1]]
Frequent words included:  [0, 5]
maximum words in the same sentence = 5
1,"['For amplification of XMRV/MLV<italic>gag</italic>sequences, 5 µl of the transcribed cDNA were used for the first round of amplification with primers 419F (<named-content content-type=""gene"">5′-ATCAGTTAACCTACCCGAGTCGGAC-3′</named-content>) and 1154R (<named-content content-type=""gene"">5′-GCCGCCTCTTCTTCATTGTTCTC-3′</named-content>)<xref ref-type=""bibr"" rid

In [14]:
sid = SentimentIntensityAnalyzer()


df_sentiment = df_introCiting.copy()
#ss = sid.polarity_scores(df_sentiment['citing_sentence'][4][0]) #if None => error
#print(ss)

df_sentiment2 = filtered_sentences_noNone
#ss2 = sid.polarity_scores(df_sentiment2[2])
ss2 = sid.polarity_scores(df_sentiment2[0])

sentiment_scores_list = []
for sentence in df_sentiment2:
    ss2 = sid.polarity_scores(sentence) # from twython  package
    sentiment_scores_list.append(ss2)
df_sentiment_scores = pd.DataFrame(sentiment_scores_list)

    #df_sentiment_scores.append(ss2)
#df_sentiment2


df_sentiment = df_sentiment.join(df_sentiment_scores)

print(df_sentiment)
"""
for i, tweet in df_sentiment['text'].iteritems():
    ss = sid.polarity_scores(str(tweet))
    for k in sorted(ss):
        df_sentiment.loc[i, k] = ss[k]
"""

                                     citing_sentence  frequent_words  \
0                                                NaN               0   
1  ['For amplification of XMRV/MLV<italic>gag</it...               5   

   sentence_length  compound  neg    neu    pos  
0                3    0.0000  0.0  0.000  0.000  
1              478    0.2023  0.0  0.917  0.083  


"\nfor i, tweet in df_sentiment['text'].iteritems():\n    ss = sid.polarity_scores(str(tweet))\n    for k in sorted(ss):\n        df_sentiment.loc[i, k] = ss[k]\n"

In [None]:
x = np.arange(len(df_sentiment))
df_sentiment['compound']

#plt.close()
#comp = plt.scatter(x,df_sentiment['compound'])
pos = plt.scatter(x,df_sentiment['pos'], color = 'g')
neu = plt.scatter(x,df_sentiment['neu'], color = 'y')
neg = plt.scatter(x,df_sentiment['neg'], color = 'r')

plt.legend((pos, neu, neg),('positive', 'neutral','negative'), loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel("List of paper", fontsize=14)
plt.ylabel("Sentiment score", fontsize=14)

plt.savefig("sentiment.png")
plt.show()
