***First Proposed Idea***

Create a RNN that predicts at the sentence level whether there is a dataset or not.

In [1]:
import torch
import json
import os
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
from nltk.corpus import stopwords, words
from collections import defaultdict

In [3]:
WORD_DIM = 100
STOP_WORDS = list() # stopwords.words('english')
WD = "../"
TRAIN_DIR = WD + "train/"
TEST_DIR  = WD + "test/"

In [4]:
class CorpusPipeline:
    """
    A class to transform a corpus into a list of features.
    
    ...
    
    Attributes
    ----------
    word_dim : int
        number of words to be held in a fixed size array
    stop_words : list
        list of stop words to omit from sentence
    """
    
    def __init__(self, word_dim, stop_words):
        self.word_dim = word_dim
        self.stop_words = set(stop_words)
        self.word_dict = defaultdict(lambda: 0)

    def clean_text(self, txt):
        """Return cleaned text according to regex"""
        return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())
    
    def remove_stop_words(self, word_arr):
        """Return a word array without stop words"""
        return [word for word in word_arr if word not in self.stop_words]
    
    def fit(self, corpora):
        """Creates a word to int dictionary from a corpora"""
        
        # Create word to int dictionary
        full_text = ' '.join(corpora)
        words = nltk.word_tokenize(self.clean_text(full_text))
        i = 1
        for word in set(words):
            self.word_dict[word] = i
            i += 1
        
    
    def get_features_labels(self, corpus: str, labels: list):
        """Return a corpus represented as a list of features"""
        x = nltk.sent_tokenize(corpus)
        features = np.zeros(self.word_dim)
        labels   = np.zeros(self.word_dim)
        
        x = self.clean_text(corpus)
        x = nltk.word_tokenize(x)
        x = self.remove_stop_words(x)
#         x = [self.word_dict[word] for word in x]

        y = self.clean_text()
        
        features[0:min(len(x), self.word_dim)] = x[0:min(len(x), self.word_dim)]
        
        return features
    
    def transform(self, df, text_column, feature_column_name='features'):
        return df[text_column].apply(lambda x: self.get_features(x))

In [124]:
a = ['hello', 'my', 'name', 'is', 'daniel']
b = ['my', 'name', 'is']


AttributeError: 'list' object has no attribute 'to_str'

In [5]:
def load_data(path):
    """Iterate through every file in a path and transform into a dataframe with notebook id, section title and text"""
    ids = os.listdir(path)
    return [[ID[0:-5]] + [i[1] for i in section.items()] for ID in ids for section in json.load(open(path + '/' + ID))]

In [6]:
train_data   = pd.DataFrame(load_data(TRAIN_DIR), columns=['Id', 'section_title', 'text']).astype(str)
train_labels = pd.read_csv('../train.csv').astype(str)

In [29]:
train_data

Unnamed: 0,Id,section_title,text
0,178d9a89-ca76-456d-bf9e-5b80eaae2d49,Introduction,There are many assessment tools in social scie...
1,178d9a89-ca76-456d-bf9e-5b80eaae2d49,2017). J. Educ. Sci Environ Health,
2,178d9a89-ca76-456d-bf9e-5b80eaae2d49,The Primary Stages Followed in Scale Adaptatio...,"Preparation Stage: In this stage, the research..."
3,178d9a89-ca76-456d-bf9e-5b80eaae2d49,Confirmatory Factor Analysis (CFA):,This analysis is used for scale adaptation and...
4,178d9a89-ca76-456d-bf9e-5b80eaae2d49,The Studies Conducted in Turkey,By using thematic content analysis (meta-synth...
...,...,...,...
258709,0046e50c-6d19-4f6b-a7c5-4fe935d3b6f8,Results,We first performed a preliminary analysis by u...
258710,0046e50c-6d19-4f6b-a7c5-4fe935d3b6f8,The Alzheimer's Disease Neuroimaging Initiative,
258711,0046e50c-6d19-4f6b-a7c5-4fe935d3b6f8,Imaging Genetic Data,Imaging genetics is an emergent trans-discipli...
258712,0046e50c-6d19-4f6b-a7c5-4fe935d3b6f8,Discussion,We have developed a Bayesian analysis GLRR to ...


In [108]:
grouped_dataset_labels = train_labels[['Id','cleaned_label']] \
                                    .groupby('Id') \
                                    .apply(lambda x: list(x['cleaned_label'])) \
                                    .to_frame() \
                                    .reset_index()
grouped_dataset_labels.columns = ['Id', 'cleaned_labels']

In [114]:
df = grouped_dataset_labels.merge(train_data, left_on='Id', right_on='Id')

Unnamed: 0,Id,cleaned_labels,section_title,text
0,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,[program for the international assessment of a...,Abstract,The aim of this study was to identify if acqui...
1,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,[program for the international assessment of a...,Introduction,The spur of ICT (Information and Communication...
2,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,[program for the international assessment of a...,English and ICT,Studies continue to show that primary language...
3,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,[program for the international assessment of a...,Lebanon,Lebanon is a small country situated in the Mid...
4,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,[program for the international assessment of a...,Gender Inequality in Lebanon,Lebanese women are in a constant battle with a...
...,...,...,...,...
258709,ffeb3568-7aed-4dbe-b177-cbd7f46f34af,[trends in international mathematics and scien...,The role of the status of mathematics,The factor that distinguishes mathematics from...
258710,ffeb3568-7aed-4dbe-b177-cbd7f46f34af,[trends in international mathematics and scien...,Trampoline or self-fulfilling prophecy?,Abandoning mathematics can be a rational strat...
258711,ffeb3568-7aed-4dbe-b177-cbd7f46f34af,[trends in international mathematics and scien...,Summary,The aim of this article was to reconstruct the...
258712,ffeb3568-7aed-4dbe-b177-cbd7f46f34af,[trends in international mathematics and scien...,Literature,


In [7]:
# cpipe = CorpusPipeline(WORD_DIM, STOP_WORDS)

In [8]:
cpipe.fit(train_data['text'].to_numpy())

In [9]:
len(cpipe.word_dict)

532376

In [None]:
cpipe.transform(train_data[0:100], 'text')

In [161]:
' '.join(df['text'].to_numpy())

In [140]:
cpipe.transform(train[0:10], 'text')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [53]:
df = pd.read_csv('../train.csv')

"['hello', 'man']"

In [14]:
json.load(open(WD + train_files[0]))

NameError: name 'train_files' is not defined

In [3]:
pd.read_csv('sample_submission.csv')

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,
1,2f392438-e215-4169-bebf-21ac4ff253e1,
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,


In [6]:
pd.read_csv('train.csv').sort_values('Id')

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
1582,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,The Impact of ICT Training on Income Generatio...,Program for the International Assessment of Ad...,Program for the International Assessment of Ad...,program for the international assessment of ad...
7736,0008656f-0ba2-4632-8602-3017b44c2e90,Finnish Ninth Graders’ Gender Appropriateness ...,Trends in International Mathematics and Scienc...,Trends in International Mathematics and Scienc...,trends in international mathematics and scienc...
16193,000e04d6-d6ef-442f-b070-4309493221ba,Economic Research Service: Specialized Agency...,Agricultural Resource Management Survey,Agricultural Resources Management Survey,agricultural resources management survey
4511,000efc17-13d8-433d-8f62-a3932fe4f3b8,Risk factors and global cognitive status relat...,Alzheimer's Disease Neuroimaging Initiative (A...,ADNI,adni
10461,000efc17-13d8-433d-8f62-a3932fe4f3b8,Risk factors and global cognitive status relat...,Alzheimer's Disease Neuroimaging Initiative (A...,Alzheimer's Disease Neuroimaging Initiative (A...,alzheimer s disease neuroimaging initiative adni
...,...,...,...,...,...
10385,ffd4d86a-0f26-44cc-baed-f0e209cc22af,A Spherical Brain Mapping of MR Images for the...,Alzheimer's Disease Neuroimaging Initiative (A...,Alzheimer's Disease Neuroimaging Initiative (A...,alzheimer s disease neuroimaging initiative adni
19343,ffe7f334-245a-4de7-b600-d7ff4e28bfca,COVID-19 and Possible Pharmacological Preventi...,SARS-CoV-2 genome sequence,genome sequences of SARS-CoV-2,genome sequences of sars cov 2
7667,ffeb3568-7aed-4dbe-b177-cbd7f46f34af,Abandoning mathematics. Reconstructing the pro...,Trends in International Mathematics and Scienc...,Trends in International Mathematics and Scienc...,trends in international mathematics and scienc...
11079,ffee2676-a778-4521-b947-e1e420b126c5,A different viewpoint on student retention,Beginning Postsecondary Student,Beginning Postsecondary Students,beginning postsecondary students
