In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('topic_modeling.csv')

In [3]:
data.head()

Unnamed: 0,Cluster_Topic,PY,Topic_Year,PT,AF,PU,FU,PA,PG
0,6,2009,6_2009,J,"DeWit, Matthew A.; Gillies, Elizabeth R.",AMER CHEMICAL SOC,Natural Sciences and Engineering Council of Ca...,"1155 16TH ST, NW, WASHINGTON, DC 20036 USA",8
1,2,2009,2_2009,J,"Zhang, Huigang; Zhu, Qingshan",WORLD SCIENTIFIC PUBL CO PTE LTD,,"5 TOH TUCK LINK, SINGAPORE 596224, SINGAPORE",10
2,5,2009,5_2009,J,"Yoshida, Mutsumi; Roh, Kyung-Ho; Mandal, Supar...",WILEY-V C H VERLAG GMBH,,"POSTFACH 101161, 69451 WEINHEIM, GERMANY",7
3,9,2009,9_2009,J,"Tsou, Hsi-Kai; Hsieh, Ping-Yen; Chung, Chi-Jen...",ELSEVIER SCIENCE SA,,"PO BOX 564, 1001 LAUSANNE, SWITZERLAND",5
4,13,2009,13_2009,J,"Doshi, Nishit; Mitragotri, Samir",WILEY-V C H VERLAG GMBH,National Institute of Health [1U01HL080718],"PO BOX 10 11 61, D-69451 WEINHEIM, GERMANY",12


In [4]:
data.isnull().sum()

Cluster_Topic        0
PY                   0
Topic_Year           0
PT                   0
AF                   2
PU                   1
FU               19669
PA                   1
PG                   0
dtype: int64

# Top authors in each year
- For each year, extract the top authors who contributed the most in the top list in terms of the number of publications in previous years.
- For instance, to find the top authors in 2013, we should consider top authors who published before 2013 rather than only 2013.

In [5]:
# Impute missing values in 'AF' column.
data['AF'].fillna('NA author', inplace=True)

In [6]:
data['All_authors'] = data.AF.str.split(';')

In [7]:
data['All_authors'].head()

0          [DeWit, Matthew A.,  Gillies, Elizabeth R.]
1                     [Zhang, Huigang,  Zhu, Qingshan]
2    [Yoshida, Mutsumi,  Roh, Kyung-Ho,  Mandal, Su...
3    [Tsou, Hsi-Kai,  Hsieh, Ping-Yen,  Chung, Chi-...
4                  [Doshi, Nishit,  Mitragotri, Samir]
Name: All_authors, dtype: object

In [8]:
# Make sure no missing value in 'AF' and 'All_authors'.
data.isnull().sum()

Cluster_Topic        0
PY                   0
Topic_Year           0
PT                   0
AF                   0
PU                   1
FU               19669
PA                   1
PG                   0
All_authors          0
dtype: int64

In [9]:
def top_author_in_year(year,index,dataframe):
    '''
    year: a target year.
    index: an index number to determine how many top authors should be extracted.
    dataframe: data stored in a pandas frame.
    Output: Return a list of top authors before a target year given an index number.
    '''
    
    df = dataframe
    grouped_df_in_year = df[df['PY'] <= year]
    author_in_year = grouped_df_in_year['All_authors']
    author_count_dic = {}
    for authors in author_in_year:
        for author in authors:
            if author not in author_count_dic:
                author_count_dic[author.strip().replace('.','')] = 1
            else: 
                author_count_dic[author.strip().replace('.','')] += 1
    sorted_author_count_dic = sorted(author_count_dic.items(), key=lambda kv: kv[1])[::-1]
    top_authors = []
    for i in range(index):
        top_authors.append(sorted_author_count_dic[i][0])
        
    # Consider 'tied' authors.
    for each in sorted_author_count_dic[index:]:
        if each[1] == sorted_author_count_dic[index-1][1]:
            top_authors.append(each[0])
        
    return top_authors

In [10]:
# Collect top authors in each year.
year = []
top_authors = []
for i in range(2001, 2020):
    year.append(i)
    top_authors.append(top_author_in_year(year=i-1,index=10,dataframe=data))

In [11]:
top_author_df = pd.DataFrame({'PY': year, 'top_authors_in_previous_yrs': top_authors})
top_author_df.head()

Unnamed: 0,PY,top_authors_in_previous_yrs
0,2001,"[BRUCK, SD, [Anonymous], HEIMKE, G, DANIELS, A..."
1,2002,"[[Anonymous], BRUCK, SD, Bellon, JM, HEIMKE, G..."
2,2003,"[[Anonymous], BRUCK, SD, HEIMKE, G, DANIELS, A..."
3,2004,"[[Anonymous], BRUCK, SD, HEIMKE, G, DANIELS, A..."
4,2005,"[[Anonymous], BRUCK, SD, HEIMKE, G, DANIELS, A..."


In [12]:
# Make sure collect all top authors from every year.
top_author_df.PY.describe()[['min','max']]

min    2001.0
max    2019.0
Name: PY, dtype: float64

# Top publishers in each year
- For each year, extract top publishers which published the most in the top list in terms of the number of publications in previous years.

In [13]:
# Impute missing values in 'PU' column.
data['PU'].fillna('NA publisher', inplace=True)

In [14]:
# Make sure no missing data in 'PU'.
data['PU'].isnull().sum()

0

In [15]:
def top_publisher_in_year(year,index,dataframe):
    '''
    year: a target year.
    index: an index number to determine how many top publishers should be extracted.
    dataframe: data stored in a pandas frame.
    Output: Return a list of top publishers before a target year given an index number.
    '''
    
    df = dataframe
    grouped_df_in_year = df[df['PY'] <= year]
    publisher_in_year = grouped_df_in_year['PU']
    publisher_count_dic = {}
    for publisher in publisher_in_year:
        if publisher not in publisher_count_dic:
            publisher_count_dic[publisher] = 1
        else: 
            publisher_count_dic[publisher] += 1
    
    sorted_publisher_count_dic = sorted(publisher_count_dic.items(), key=lambda kv: kv[1])[::-1]
    top_publishers = []
    for i in range(index):
        top_publishers.append(sorted_publisher_count_dic[i][0])
        
    # Consider 'tied' authors.
    for each in sorted_publisher_count_dic[index:]:
        if each[1] == sorted_publisher_count_dic[index-1][1]:
            top_publishers.append(each[0])
        
    return top_publishers

In [16]:
# Collect top publishers in each year.
year = []
top_publishers = []
for i in range(2001, 2020):
    year.append(i)
    top_publishers.append(top_publisher_in_year(year=i-1,index=10,dataframe=data))

In [17]:
top_publisher_df = pd.DataFrame({'PY': year, 'top_publishers_in_previous_yrs': top_publishers})

In [18]:
top_publisher_df.head()

Unnamed: 0,PY,top_publishers_in_previous_yrs
0,2001,"[JOHN WILEY & SONS INC, ELSEVIER SCI LTD, AMER..."
1,2002,"[ELSEVIER SCI LTD, JOHN WILEY & SONS INC, AMER..."
2,2003,"[ELSEVIER SCI LTD, JOHN WILEY & SONS INC, AMER..."
3,2004,"[ELSEVIER SCI LTD, JOHN WILEY & SONS INC, AMER..."
4,2005,"[ELSEVIER SCI LTD, JOHN WILEY & SONS INC, AMER..."


In [19]:
# Make sure collect all top publishers from every year.
top_publisher_df.PY.describe()[['min','max']]

min    2001.0
max    2019.0
Name: PY, dtype: float64

# Top funding agencies in each year
- For each year, extract the top funding agencies which funded the most in the top list in terms of the number of publications in previous years.

In [20]:
# Impute missing values in 'AF' column.
data['FU'].fillna('NA funding agency', inplace=True)

In [21]:
data['All_funding_agencies'] = data.FU.str.split(';')

In [22]:
data['All_funding_agencies'].head()

0    [Natural Sciences and Engineering Council of C...
1                                  [NA funding agency]
2                                  [NA funding agency]
3                                  [NA funding agency]
4        [National Institute of Health [1U01HL080718]]
Name: All_funding_agencies, dtype: object

In [23]:
# Make sure no missing data in 'FU' and 'All_funding_agencies'.
data.isnull().sum()

Cluster_Topic           0
PY                      0
Topic_Year              0
PT                      0
AF                      0
PU                      0
FU                      0
PA                      1
PG                      0
All_authors             0
All_funding_agencies    0
dtype: int64

In [24]:
def top_funding_agency_in_year(year,index,dataframe):
    '''
    year: a target year.
    index: an index number to determine how many top funding agencies should be extracted.
    dataframe: data stored in a pandas frame.
    Output: Return a list of top funding agencies before a target year given an index number.
    '''
    
    df = dataframe
    grouped_df_in_year = df[df['PY'] <= year]
    funding_agency_in_year = grouped_df_in_year['All_funding_agencies']
    funding_agency_count_dic = {}
    
    for funding_agencies in funding_agency_in_year:
        for funding_agency in funding_agencies:
            if funding_agency not in funding_agency_count_dic:
                funding_agency_count_dic[funding_agency] = 1
            else: 
                funding_agency_count_dic[funding_agency] += 1
                
    sorted_funding_agency_count_dic = sorted(funding_agency_count_dic.items(), key=lambda kv: kv[1])[::-1]
    top_funding_agencies = []
    
    for i in range(index):
        top_funding_agencies.append(sorted_funding_agency_count_dic[i][0])
        
    # Consider 'tied' authors.
    for each in sorted_funding_agency_count_dic[index:]:
        if each[1] == sorted_funding_agency_count_dic[index-1][1]:
            top_funding_agencies.append(each[0])
        
    return top_funding_agencies

In [25]:
# Collect top funding agencies in each year.
year = []
top_funding_agencies = []
for i in range(2001, 2020):
    year.append(i)
    top_funding_agencies.append(top_funding_agency_in_year(year=i-1,index=10,dataframe=data))

In [26]:
top_funding_agency_df = pd.DataFrame({'PY': year, 'top_funding_agency_in_previous_yrs': top_funding_agencies})

In [27]:
top_funding_agency_df.head()

Unnamed: 0,PY,top_funding_agency_in_previous_yrs
0,2001,"[NA funding agency, Wellcome Trust, NHLBI NIH ..."
1,2002,"[NA funding agency, Wellcome Trust, NCRR NIH H..."
2,2003,"[NA funding agency, NCRR NIH HHS [RR01296], We..."
3,2004,"[NA funding agency, NCRR NIH HHS [RR01296], We..."
4,2005,"[NA funding agency, NCRR NIH HHS [RR01296], We..."


# Have funding agency
- Because originally, there are many missing data in 'FU'. Create a boolean type of variable to indicate whether there is a funding agency or not.

In [28]:
def have_funding_agency(x):
    if x == 'NA funding agency':
        return 0
    else:
        return 1

In [29]:
data['Have_Funding_Agency'] = data['FU'].apply(have_funding_agency)

In [30]:
data[['FU','Have_Funding_Agency']].head(10)

Unnamed: 0,FU,Have_Funding_Agency
0,Natural Sciences and Engineering Council of Ca...,1
1,NA funding agency,0
2,NA funding agency,0
3,NA funding agency,0
4,National Institute of Health [1U01HL080718],1
5,NA funding agency,0
6,French-Australian Science and Technology Progr...,1
7,National Heart Lung and Blood Institute's Prog...,1
8,JICA AUN/SEED net,1
9,National Science Foundation; National Institut...,1


# Count of author
- Publications might have several authors so perhaps more authors could bring more attentation on their publications.

In [31]:
data['Author_count'] = data['All_authors'].apply(len)

In [32]:
data[['All_authors','Author_count']].head(10)

Unnamed: 0,All_authors,Author_count
0,"[DeWit, Matthew A., Gillies, Elizabeth R.]",2
1,"[Zhang, Huigang, Zhu, Qingshan]",2
2,"[Yoshida, Mutsumi, Roh, Kyung-Ho, Mandal, Su...",8
3,"[Tsou, Hsi-Kai, Hsieh, Ping-Yen, Chung, Chi-...",6
4,"[Doshi, Nishit, Mitragotri, Samir]",2
5,"[Bedi, Rajwant S., Zanello, Laura P., Yan, Y...",3
6,"[Ting, S. R. Simon, Min, Eun Hee, Escale, Pi...",6
7,"[Doshi, Nishit, Zahr, Alisar S., Bhaskar, Sr...",5
8,"[Arre Toque, Jay, Hamdi, M., Ide-Ektessabi, ...",4
9,"[Caldorera-Moore, Mary, Peppas, Nicholas A.]",2


# Count of funding agency
- Number of funding agency. Probably, more funding agencies to sponsor a publication could bring more attentation on it.

In [33]:
def count_funding_agency(x):
    if x == ['NA funding agency']:
        return 0
    else:
        return len(x)

In [34]:
data['Funding_agency_count'] = data['All_funding_agencies'].apply(count_funding_agency)

In [35]:
data[['Funding_agency_count', 'All_funding_agencies']].head(10)

Unnamed: 0,Funding_agency_count,All_funding_agencies
0,3,[Natural Sciences and Engineering Council of C...
1,0,[NA funding agency]
2,0,[NA funding agency]
3,0,[NA funding agency]
4,1,[National Institute of Health [1U01HL080718]]
5,0,[NA funding agency]
6,1,[French-Australian Science and Technology Prog...
7,3,[National Heart Lung and Blood Institute's Pro...
8,1,[JICA AUN/SEED net]
9,2,"[National Science Foundation, National Instit..."


# Publisher
- Lower down the levels in Publisher into seven levels by only considering top publishers in recent years from 2008 to 2018.

In [36]:
def squeeze_publisher(x):
    top_publishers = ['ELSEVIER SCI LTD', 'ELSEVIER SCIENCE BV', 'AMER CHEMICAL SOC', 'WILEY',\
                      'SPRINGER', 'WILEY-BLACKWELL', 'ROYAL SOC CHEMISTRY', 'WILEY-V C H VERLAG GMBH',\
                      'ELSEVIER SCIENCE SA', 'PERGAMON-ELSEVIER SCIENCE LTD']
    if x.strip().upper() in top_publishers:
        return x.strip().upper()
    else:
        return 'Other Publishers'

In [37]:
data['Squeezed_Publisher'] = data.PU.apply(squeeze_publisher)

In [38]:
data[['PU','Squeezed_Publisher']].head(10)

Unnamed: 0,PU,Squeezed_Publisher
0,AMER CHEMICAL SOC,AMER CHEMICAL SOC
1,WORLD SCIENTIFIC PUBL CO PTE LTD,Other Publishers
2,WILEY-V C H VERLAG GMBH,WILEY-V C H VERLAG GMBH
3,ELSEVIER SCIENCE SA,ELSEVIER SCIENCE SA
4,WILEY-V C H VERLAG GMBH,WILEY-V C H VERLAG GMBH
5,WILEY-V C H VERLAG GMBH,WILEY-V C H VERLAG GMBH
6,AMER CHEMICAL SOC,AMER CHEMICAL SOC
7,NATL ACAD SCIENCES,Other Publishers
8,WORLD SCIENTIFIC PUBL CO PTE LTD,Other Publishers
9,ELSEVIER SCIENCE BV,ELSEVIER SCIENCE BV


In [39]:
# Make sure the transformation is correct!
# Must be 11 levels.
len(data.Squeezed_Publisher.unique())

11

# Country (Skipped this part)

In [None]:
# Impute missing values in 'PA' column.
data['PA'].fillna('NA country', inplace=True)

In [None]:
def extract_country(x):
    target_country = ('usa','china','germany','japan','england','italy','korea',\
                      'france','india','canada','spain','australia','netherlands','portugal',\
                      'switzerland','sweden')
    split_country = x.strip().lower().split(' ')
    if any(each in target_country for each in split_country):
        if split_country[-1] in target_country:
            return split_country[-1].upper()
        elif split_country[-2] in target_country:
            return split_country[-2].upper()
        elif split_country[-3] in target_country:
            return split_country[-3].upper()
    else:
        return 'Other countries'
        
        #search_country = x.split(' ')
        #total_len = len(search_country)
        #for i in range(total_len):
            #if search_country[i] in target_country:
                #final_country = search_country[i]
                
                #return final_country

# Final Data

In [40]:
# First merge the original dataframe with new feature each by each on the key 'PY' (or year).
print(top_author_df.head())
print(top_publisher_df.head())
print(top_funding_agency_df.head())

     PY                        top_authors_in_previous_yrs
0  2001  [BRUCK, SD, [Anonymous], HEIMKE, G, DANIELS, A...
1  2002  [[Anonymous], BRUCK, SD, Bellon, JM, HEIMKE, G...
2  2003  [[Anonymous], BRUCK, SD, HEIMKE, G, DANIELS, A...
3  2004  [[Anonymous], BRUCK, SD, HEIMKE, G, DANIELS, A...
4  2005  [[Anonymous], BRUCK, SD, HEIMKE, G, DANIELS, A...
     PY                     top_publishers_in_previous_yrs
0  2001  [JOHN WILEY & SONS INC, ELSEVIER SCI LTD, AMER...
1  2002  [ELSEVIER SCI LTD, JOHN WILEY & SONS INC, AMER...
2  2003  [ELSEVIER SCI LTD, JOHN WILEY & SONS INC, AMER...
3  2004  [ELSEVIER SCI LTD, JOHN WILEY & SONS INC, AMER...
4  2005  [ELSEVIER SCI LTD, JOHN WILEY & SONS INC, AMER...
     PY                 top_funding_agency_in_previous_yrs
0  2001  [NA funding agency, Wellcome Trust, NHLBI NIH ...
1  2002  [NA funding agency, Wellcome Trust, NCRR NIH H...
2  2003  [NA funding agency, NCRR NIH HHS [RR01296], We...
3  2004  [NA funding agency, NCRR NIH HHS [RR01296], We.

In [41]:
# Shape before merging.
data.shape

(45632, 15)

In [42]:
new_data = pd.merge(data, top_author_df, on='PY', how='left')
new_data = pd.merge(new_data, top_publisher_df, on='PY', how='left')
new_data = pd.merge(new_data, top_funding_agency_df, on='PY', how='left')
new_data.shape

(45632, 18)

In [43]:
new_data.head()

Unnamed: 0,Cluster_Topic,PY,Topic_Year,PT,AF,PU,FU,PA,PG,All_authors,All_funding_agencies,Have_Funding_Agency,Author_count,Funding_agency_count,Squeezed_Publisher,top_authors_in_previous_yrs,top_publishers_in_previous_yrs,top_funding_agency_in_previous_yrs
0,6,2009,6_2009,J,"DeWit, Matthew A.; Gillies, Elizabeth R.",AMER CHEMICAL SOC,Natural Sciences and Engineering Council of Ca...,"1155 16TH ST, NW, WASHINGTON, DC 20036 USA",8,"[DeWit, Matthew A., Gillies, Elizabeth R.]",[Natural Sciences and Engineering Council of C...,1,2,3,AMER CHEMICAL SOC,"[[Anonymous], BRUCK, SD, Tsuji, H, Wood, Jonat...","[ELSEVIER SCI LTD, AMER CHEMICAL SOC, ELSEVIER...","[NA funding agency, Wellcome Trust, NCRR NIH H..."
1,2,2009,2_2009,J,"Zhang, Huigang; Zhu, Qingshan",WORLD SCIENTIFIC PUBL CO PTE LTD,NA funding agency,"5 TOH TUCK LINK, SINGAPORE 596224, SINGAPORE",10,"[Zhang, Huigang, Zhu, Qingshan]",[NA funding agency],0,2,0,Other Publishers,"[[Anonymous], BRUCK, SD, Tsuji, H, Wood, Jonat...","[ELSEVIER SCI LTD, AMER CHEMICAL SOC, ELSEVIER...","[NA funding agency, Wellcome Trust, NCRR NIH H..."
2,5,2009,5_2009,J,"Yoshida, Mutsumi; Roh, Kyung-Ho; Mandal, Supar...",WILEY-V C H VERLAG GMBH,NA funding agency,"POSTFACH 101161, 69451 WEINHEIM, GERMANY",7,"[Yoshida, Mutsumi, Roh, Kyung-Ho, Mandal, Su...",[NA funding agency],0,8,0,WILEY-V C H VERLAG GMBH,"[[Anonymous], BRUCK, SD, Tsuji, H, Wood, Jonat...","[ELSEVIER SCI LTD, AMER CHEMICAL SOC, ELSEVIER...","[NA funding agency, Wellcome Trust, NCRR NIH H..."
3,9,2009,9_2009,J,"Tsou, Hsi-Kai; Hsieh, Ping-Yen; Chung, Chi-Jen...",ELSEVIER SCIENCE SA,NA funding agency,"PO BOX 564, 1001 LAUSANNE, SWITZERLAND",5,"[Tsou, Hsi-Kai, Hsieh, Ping-Yen, Chung, Chi-...",[NA funding agency],0,6,0,ELSEVIER SCIENCE SA,"[[Anonymous], BRUCK, SD, Tsuji, H, Wood, Jonat...","[ELSEVIER SCI LTD, AMER CHEMICAL SOC, ELSEVIER...","[NA funding agency, Wellcome Trust, NCRR NIH H..."
4,13,2009,13_2009,J,"Doshi, Nishit; Mitragotri, Samir",WILEY-V C H VERLAG GMBH,National Institute of Health [1U01HL080718],"PO BOX 10 11 61, D-69451 WEINHEIM, GERMANY",12,"[Doshi, Nishit, Mitragotri, Samir]",[National Institute of Health [1U01HL080718]],1,2,1,WILEY-V C H VERLAG GMBH,"[[Anonymous], BRUCK, SD, Tsuji, H, Wood, Jonat...","[ELSEVIER SCI LTD, AMER CHEMICAL SOC, ELSEVIER...","[NA funding agency, Wellcome Trust, NCRR NIH H..."


In [48]:
def contain_top_feature(feature_name,top_feature_name,dataframe):
    total_len = dataframe[dataframe.PY>2000].shape[0]
    filtered_df = dataframe[dataframe.PY>2000]
    filtered_df = filtered_df.reset_index()
    
    index_contain_top_feature = []
    boolean_value = []
    for i in range(total_len):
        total_features = filtered_df[feature_name][i]
        search_features = filtered_df[top_feature_name][i]
        for feat in total_features:
            if feat in search_features:
                index_contain_top_feature.append(i)
                boolean_value.append(1)
                break
            else:
                continue
    contain_top_feature_df = pd.DataFrame({'Contain_Top_'+feature_name:boolean_value},\
                                          index=index_contain_top_feature)
    return contain_top_feature_df

In [49]:
contain_top_author_df = contain_top_feature(feature_name='All_authors',
                                            top_feature_name='top_authors_in_previous_yrs',
                                            dataframe=new_data)
contain_top_fund_agency_df = contain_top_feature(feature_name='All_funding_agencies',
                                            top_feature_name='top_funding_agency_in_previous_yrs',
                                            dataframe=new_data)

In [53]:
target_df = new_data[new_data.PY>2000]
target_df = target_df.reset_index()
target_df = target_df.merge(contain_top_author_df, how='left', left_index=True, right_index=True)
target_df = target_df.merge(contain_top_fund_agency_df, how='left', left_index=True, right_index=True)

In [56]:
target_df.drop('index',axis=1,inplace=True)

In [65]:
def contain_top_feature(feature_name,top_feature_name,dataframe):
    total_len = dataframe[dataframe.PY>2000].shape[0]
    filtered_df = dataframe[dataframe.PY>2000]
    filtered_df = filtered_df.reset_index()
    
    index_contain_top_feature = []
    boolean_value = []
    for i in range(total_len):
        total_features = filtered_df[feature_name][i]
        search_features = filtered_df[top_feature_name][i]
        if total_features in search_features:
            index_contain_top_feature.append(i)
            boolean_value.append(1)
            
    contain_top_feature_df = pd.DataFrame({'Contain_Top_'+feature_name:boolean_value},\
                                          index=index_contain_top_feature)
    return contain_top_feature_df

In [66]:
contain_top_publisher_df = contain_top_feature(feature_name='PU',
                                               top_feature_name='top_publishers_in_previous_yrs',
                                               dataframe=new_data)

In [69]:
target_df = target_df.merge(contain_top_publisher_df, how='left', left_index=True, right_index=True)

In [73]:
target_df['Contain_Top_All_authors'].fillna(0, inplace=True)
target_df['Contain_Top_All_funding_agencies'].fillna(0, inplace=True)
target_df['Contain_Top_PU'].fillna(0, inplace=True)

In [74]:
target_df.isnull().sum()

Cluster_Topic                         0
PY                                    0
Topic_Year                            0
PT                                    0
AF                                    0
PU                                    0
FU                                    0
PA                                    0
PG                                    0
All_authors                           0
All_funding_agencies                  0
Have_Funding_Agency                   0
Author_count                          0
Funding_agency_count                  0
Squeezed_Publisher                    0
top_authors_in_previous_yrs           0
top_publishers_in_previous_yrs        0
top_funding_agency_in_previous_yrs    0
Contain_Top_All_authors               0
Contain_Top_All_funding_agencies      0
Contain_Top_PU                        0
dtype: int64

In [81]:
# Drop off columns
drop_cols = ['AF','PU','FU','PA','All_authors','All_funding_agencies',\
             'top_authors_in_previous_yrs','top_publishers_in_previous_yrs','top_funding_agency_in_previous_yrs']
target_df.drop(drop_cols, axis=1,inplace=True)

In [82]:
target_df.shape

(42293, 12)

In [83]:
target_df.head()

Unnamed: 0,Cluster_Topic,PY,Topic_Year,PT,PG,Have_Funding_Agency,Author_count,Funding_agency_count,Squeezed_Publisher,Contain_Top_All_authors,Contain_Top_All_funding_agencies,Contain_Top_PU
0,6,2009,6_2009,J,8,1,2,3,AMER CHEMICAL SOC,0.0,0.0,1.0
1,2,2009,2_2009,J,10,0,2,0,Other Publishers,0.0,1.0,0.0
2,5,2009,5_2009,J,7,0,8,0,WILEY-V C H VERLAG GMBH,0.0,1.0,1.0
3,9,2009,9_2009,J,5,0,6,0,ELSEVIER SCIENCE SA,0.0,1.0,1.0
4,13,2009,13_2009,J,12,1,2,1,WILEY-V C H VERLAG GMBH,0.0,0.0,1.0


In [84]:
target_df.to_csv('final_data.csv',index=False)