In [6]:
!pip install transformers
!pip install torch torchvision
!pip install sentencepiece



In [7]:
import pandas as pd
import numpy as np 
import nltk
from nltk import WordPunctTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords
import seaborn as sns
import os
import torch
from transformers import CamembertModel, CamembertTokenizer, CamembertConfig
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import torch.nn as nn 
from skimage import io, transform
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms, utils
import random
import cv2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from google.colab import drive,files
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
!unzip /content/drive/MyDrive/DS2021/Datasets/archive\(8\).zip 

Archive:  /content/drive/MyDrive/DS2021/Datasets/archive(8).zip
replace X_test_update.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
N
N
N


In [10]:
x_train = pd.read_csv("/content/X_train_update.csv")
y_train = pd.read_csv("/content/Y_train_CVw08PX.csv")
x_test = pd.read_csv("/content/X_test_update.csv")

In [11]:
x_train.rename(columns = {"Unnamed: 0": "Id"}, inplace = True)
y_train.rename(columns = {"Unnamed: 0": "Id"}, inplace = True)
x_test.rename(columns = {"Unnamed: 0": "Id"}, inplace = True)

Helpers function, Classes

```
# This is formatted as code
```



In [12]:
class simple_Text_cleaner(BaseEstimator, TransformerMixin):
  # add another additional parameter, just for fun, while we are at it
    def __init__(self, stopwords,columns,tokenizer=WordPunctTokenizer()): 
        self.columns = columns
        self.stopwords = stopwords
        self.tokenizer=tokenizer
    def rm_stopwords(self,tokens):
        tokens=self.tokenizer.tokenize(tokens)
        return [ tk for tk in tokens if  tk not in self.stopwords ]
    def text_clean_up(self,s=""):
        import re
        user_pattern       = '@[^\s]+'
        s=re.sub(user_pattern, "", s)
        remove = '"#$%&()*+/:;<=>@[\\]^_`{|}~”“'
        pattern = r"[{}]".format(remove)
        s=re.sub(pattern,' ', s) 
        sequencePattern   = r"(.)\1\1+"
        seqReplacePattern = r"\1\1"
        s = re.sub(sequencePattern, seqReplacePattern, s)
        s = re.sub("<[^>]*>",' ', s)
        s = re.sub("[\r\n]+",' ', s)
        s = re.sub("http\S+",' ', s)
        s = re.sub("\$[^>]*\$",' ', s)
        s = re.sub("\d+",' ', s)
        s = re.sub("\s\s+",' ', s)
        s.strip()
        return s
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_ = X.copy()
        X_["image_path"] = (X_["imageid"].apply(lambda x: str(int(x)))+"_product_"+X_["productid"].apply(lambda x: str(int(x)))+".jpg").apply(lambda x: "image_"+x)
        for i in self.columns:
            X_[i] = X_[i].apply(lambda x: " ".join(self.rm_stopwords(self.text_clean_up(x))))
        return X_

In [60]:
def encode_reviews(tokenizer, model, device, cpu, reviews, max_length):
    token_ids = torch.tensor([])
    input_ids=torch.tensor([],dtype=torch.long)
    attention_mask=torch.tensor([],dtype=torch.long)
    h = 0
    for i, review in enumerate(reviews):
        encoded = encoded_text = tokenizer.encode_plus(
                        review,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_length,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
        input_ids=torch.cat((input_ids,encoded_text['input_ids']),dim=0)
        attention_mask=torch.cat((attention_mask,encoded_text['attention_mask']),dim=0)
        h+=1
        if h == 128:
            hidden = model(input_ids.to(device),attention_mask=attention_mask.to(device))[2][-2]
            token_ids=torch.cat((token_ids,torch.mean(hidden, dim=1).to(cpu)),dim=0)
            h=0
            input_ids=torch.tensor([],dtype=torch.long)
            attention_mask=torch.tensor([],dtype=torch.long)
    if input_ids != torch.tensor([],dtype=torch.long):
        hidden = model(input_ids.to(device),attention_mask=attention_mask.to(device))[2][-2]
        token_ids=torch.cat((token_ids,torch.mean(hidden, dim=1).to(cpu)),dim=0)
        print(token_ids.size())
    token_ids=token_ids.numpy()
    print(token_ids.shape)
    return token_ids

class CamembertPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, max_seq_length,column):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.cpu=torch.device("cpu")
        print(self.device)
        self.tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
        self.config = CamembertConfig.from_pretrained("camembert-base", output_hidden_states=True)
        self.camembert =CamembertModel.from_pretrained("camembert-base", config=self.config).to(self.device)
        for p in self.camembert.parameters():
                p.requires_grad_(False)
        self.max_seq_length = max_seq_length
        self.column = column
        self.camembert.eval()
    def fit(self, X=None):
        return self 
    
    def transform(self, X, y=None):
        # 1. Tokenize
        X_encoded = encode_reviews(self.tokenizer,self.camembert,self.device,self.cpu, X[self.column].values, self.max_seq_length)
        return X_encoded     
    
    def fit_transform(self, X, y=None):        
        return self.transform(X, y)

In [14]:

torch.cat((torch.tensor([]),torch.tensor([1,2])),dim=0)

tensor([1., 2.])

In [15]:
class Second_to_last_SentenceEmbedding(nn.Module):
    def __init__(self):
        #super(, self).__init__()
        super().__init__()
        self.config = CamembertConfig.from_pretrained("camembert-base", output_hidden_states=True)
        self.camembert = CamembertModel.from_pretrained("camembert-base", config=self.config)
        self.dense_layer = nn.Sequential(nn.Linear(768,100),nn.ReLU(),nn.Dropout(p=0.2))

        self.fc2=nn.Linear(100,27)
        for p in self.camembert.parameters():
            p.requires_grad_(False)
    def forward(self, input,attention_mask=None):
        hidden = self.camembert(input,attention_mask=attention_mask)[2]
        token_vecs = hidden[-2]
        x = torch.mean(token_vecs,dim=1)
        h = self.dense_layer(x)
        return self.fc2(h)

In [16]:
class Columns_Selector(BaseEstimator, TransformerMixin):
    def __init__(self,column):
        self.column = column
    def fit(self, X = None):
        return self 
    def transform(self, X, y=None):
        return X[self.column]     
    def fit_transform(self, X, y=None):        
        return self.transform(X, y)

In [17]:
class To_dense(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X = None):
        return self 
    def transform(self, X, y=None):
        X = X.toarray()
        print(X.shape)
        return X 
    def fit_transform(self, X, y=None):        
        return self.transform(X, y)

In [18]:
class MultimodalTruncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Dataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self,tokenizer, input_csv_file, root_dir,max_length=60 , transform=None,Y_csv_file=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        if type(input_csv_file) == str:
            self.input_file = pd.read_csv(input_csv_file)
        else:
            self.input_file = input_csv_file
        self.input_file.rename(columns={"Unnamed: 0": "Id"}, inplace=True)
        self.root_dir = root_dir
        self.transform = transform
        self.max_length=max_length
        self.input_file["image_path"]=(self.input_file["imageid"].apply(lambda x: str(int(x)))+"_product_"+self.input_file["productid"].apply(lambda x: str(int(x)))+".jpg").apply(lambda x: "image_"+x)
        if Y_csv_file !=None:
            if type(Y_csv_file) == str:
                self.output = pd.read_csv(Y_csv_file)
                
            else:
                self.output = Y_csv_file
            self.output.rename(columns={"Unnamed: 0": "Id"}, inplace=True)
            self.classes = list(set(self.output["prdtypecode"].values))
        else:
            self.ouptut = None
    def __len__(self):
        return len(self.input_file)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        img_name = os.path.join(self.root_dir,
                                self.input_file["image_path"].iloc[idx])
        image = io.imread(img_name)
        text = self.input_file["designation"].iloc[idx]
        encoded_text = tokenizer.encode_plus(
                        text,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
        sample = {'image': image, 'input_ids':encoded_text['input_ids'] ,'attention_mask':encoded_text['attention_mask']}

        if self.transform:
            sample['image'] = self.transform(sample['image'])
        if self.output is None:
            return sample
        sample["label"]=self.classes.index(self.output["prdtypecode"].loc[self.input_file["Id"].iloc[idx]])
        return sample

In [19]:
class Microscope:
    """
    Cutting out the edges around the center circle of the image
    Imitating a picture, taken through the microscope

    Args:
        p (float): probability of applying an augmentation
    """

    def __init__(self, p: float = 0.5):
        self.p = p

    def __call__(self, img):
        """
        Args:
            img (PIL Image): Image to apply transformation to.

        Returns:NumPyNumPy
            PIL Image: Image with transformation.
        """
        if random.random() < self.p:
            circle = cv2.circle((np.ones(img.shape) * 255).astype(np.uint8), # image placeholder
                        (img.shape[0]//2, img.shape[1]//2), # center point of circle
                        random.randint(img.shape[0]//2 - 3, img.shape[0]//2 + 15), # radius
                        (0, 0, 0), # color
                        -1)

            mask = circle - 255
            img = np.multiply(img, mask)
        
        return img

    def __repr__(self):
        return f'{self.__class__.__name__}(p={self.p})'

In [20]:
class Feature_Extraction(nn.Module):
    # deuxieme dimension de l'output de chaque tenseurs a la sortie d'un layer de resnet
    resnet_caracteristics=[64,64,64,64,256,512,1024,2048,2048]
    pooling_target=[(4,8),(4,8),(4,8),(4,8),(2,4),(2,2),(1,2),(1,1),(1,1)]
    embedding_strategy={"Second_to_last_average":768,"Start_token_embedding":768,"last_four_embedding_average":3072}
    def __init__(self,resnet_layers=-1,to_tune=False,sentence_embedding="Second_to_last_average"):
        #super(, self).__init__()
        super().__init__()
        self.config = CamembertConfig.from_pretrained("camembert-base", output_hidden_states=True)
        self.camembert =CamembertModel.from_pretrained("camembert-base", config=self.config)
        self.Resnet = models.resnet50(pretrained=True)
        self.Resnet = nn.Sequential(*list(self.Resnet.children())[:resnet_layers])
        self.pooling = nn.AdaptiveMaxPool2d(self.pooling_target[resnet_layers-1])
        self.flat = nn.Flatten()
        self.strategy = sentence_embedding
        if sentence_embedding not in self.embedding_strategy:
            self.strategy = "Second_to_last_average"    
        if self.strategy == "Start_token_embedding":
            to_tune = True
        if to_tune == False:
            for p in self.camembert.parameters():
                p.requires_grad_(False)
            for p in self.Resnet.parameters():
                p.requires_grad_(False)
        self.output_size = 2048+ self.embedding_strategy[self.strategy]
    def sentence_embedding(self,hiddens):
        # we get the output of the second to last hidden layers and average it over all token
        if self.strategy == "Second_to_last_average":
            return torch.mean(hiddens[-2],dim=1) 
        # we use the first token embedding  from the ouptut last hidden layers 
        # fine_tune should be true 
        elif self.strategy == "Start_token_embedding":
            return hiddens[-1].permute(1,0,2)[0]
        # we use the last four hidden layer average and concatenate them
        elif self.strategy == "last_four_embedding_average":
            x=torch.cat((hiddens[-4], hiddens[-3], hiddens[-2], hiddens[-1]), dim = 2)
            return torch.mean(x, dim=1)
    def forward(self, input, image, attention_mask=None):
        hiddens = self.camembert(input,attention_mask = attention_mask)[2]
        embeddings = self.sentence_embedding(hiddens)
        x = self.Resnet(image)
        h = self.pooling(x)
        return torch.cat((embeddings, self.flat(h)),dim=1)

In [21]:
class Multimodal_Dense_model(nn.Module):
    def __init__(self,dropout=0.2,resnet_layers=-1,to_tune=False,sentence_embedding="Second_to_last_average"):
        #super(, self).__init__()
        super().__init__()
        self.feature_extractor = Feature_Extraction(resnet_layers=resnet_layers,to_tune=to_tune,sentence_embedding=sentence_embedding)
        self.input_size = self.feature_extractor.output_size
        self.dense_layer = nn.Sequential(OrderedDict([
          ('dense1', nn.Linear(in_features=self.input_size,out_features=768)),
          ('relu1', nn.ReLU()),
          ('dropout1', nn.Dropout(p=0.2)),
          ('dense2', nn.Linear(in_features=768,out_features=256)),
          ('relu2', nn.ReLU()),
          ('dropout2', nn.Dropout(p=0.2)),
          ('dense3', nn.Linear(in_features=256,out_features=64)),
          ('relu3', nn.ReLU()),
          ('dropout3', nn.Dropout(p=0.2))]))
        self.fc = nn.Linear(64,27)
    def forward(self, input,image,attention_mask=None):
        feature = self.feature_extractor(input=input,image=image,attention_mask=attention_mask)
        h = self.dense_layer(feature)
        return self.fc(h)

In [22]:
data_transforms = {
    'train': transforms.Compose([
        Microscope(p=0.5),
        transforms.ToPILImage(),
        transforms.RandomResizedCrop(size=384, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(512),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

In [23]:
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
training_set = MultimodalDataset(tokenizer=tokenizer, input_csv_file="/content/X_train_update.csv", root_dir="/content/images/images/image_train",max_length=60 , transform=data_transforms["train"],Y_csv_file="/content/Y_train_CVw08PX.csv")
def make_weights_for_balanced_classes(images, nclasses):                                                                            
    count = list(images.output["prdtypecode"].value_counts().loc[images.classes])                                    
    N = float(sum(count)) 
    weight_per_class=[0]*nclasses                                                  
    for i in range(nclasses):                                                   
        weight_per_class[i] = N/float(count[i])                                 
    weight = [0] * len(images)                                              
    for idx in range(int(N)):    
        h=images.classes.index(images.output["prdtypecode"].iloc[idx])                                      
        weight[idx] = weight_per_class[h]                                  
    return weight
weights = make_weights_for_balanced_classes(training_set, len(training_set.classes))                                                                
weights = torch.DoubleTensor(weights)                                       
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))                     
                                                                                                                                                                        
train_dl = torch.utils.data.DataLoader(training_set, batch_size=16,                              
                                                             sampler = sampler, num_workers=4)

Baseline model
Tf-idfVectorizer + Logistics Regression (Texte)

In [27]:
X_train,X_Val,Y_train,Y_Val=train_test_split(x_train, y_train, test_size = 0.2, shuffle = True , random_state = 155)

In [28]:
target = Y_train["prdtypecode"]

In [29]:
weights=compute_class_weight("balanced", classes= np.unique(target), y=target)

In [93]:
len(h)

27

In [30]:
h=np.unique(target)
weights_dict={}
for i in range(len(h)):
    weights_dict[h[i]] = weights[i]
    

In [102]:
?LogisticRegression

In [25]:
stop_words = stopwords.words('french')
stop_words.extend(stopwords.words('english'))

In [103]:
Base_model = Pipeline(steps = [('cleaner', simple_Text_cleaner(stopwords =stop_words, columns = ["designation"])),
                               ('column_selector', Columns_Selector(column = "designation")),
                               ('Vectorization', TfidfVectorizer(stop_words = stop_words, max_features = 5000)),
                               ('to_dense', To_dense()),
                               ('scaler', StandardScaler()),
                               ('model', LogisticRegression(multi_class='multinomial',class_weight= "balanced"))
                           ])

In [104]:
Base_model.fit(X_train,target)

(67932, 5000)


Pipeline(memory=None,
         steps=[('cleaner',
                 simple_Text_cleaner(columns=['designation'],
                                     stopwords=['au', 'aux', 'avec', 'ce',
                                                'ces', 'dans', 'de', 'des',
                                                'du', 'elle', 'en', 'et', 'eux',
                                                'il', 'ils', 'je', 'la', 'le',
                                                'les', 'leur', 'lui', 'ma',
                                                'mais', 'me', 'même', 'mes',
                                                'moi', 'mon', 'ne', 'nos', ...],
                                     tokenizer=WordPunctTokenizer(pattern='\\w+|[^\\w\\s]+', gaps=False, discard_empt...
                ('to_dense', To_dense()),
                ('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 LogisticRegression(C=1.0, class_we

In [105]:
pred=Base_model.predict(X_Val)

(16984, 5000)


In [106]:
print(classification_report(pred,Y_Val["prdtypecode"]))

              precision    recall  f1-score   support

          10       0.32      0.34      0.33       545
          40       0.59      0.59      0.59       517
          50       0.69      0.69      0.69       338
          60       0.83      0.73      0.77       175
        1140       0.60      0.60      0.60       515
        1160       0.82      0.90      0.86       719
        1180       0.53      0.40      0.46       197
        1280       0.53      0.59      0.56       928
        1281       0.45      0.34      0.39       497
        1300       0.83      0.87      0.85       989
        1301       0.89      0.87      0.88       145
        1302       0.71      0.73      0.72       480
        1320       0.63      0.60      0.61       662
        1560       0.77      0.78      0.78      1071
        1920       0.87      0.85      0.86       893
        1940       0.72      0.72      0.72       189
        2060       0.69      0.73      0.71       923
        2220       0.64    

Bert extraction De feature 

In [61]:
Simple_text_model = Pipeline(steps = [('cleaner', simple_Text_cleaner(stopwords =stop_words, columns = ["designation"])),
                               ('Vectorization', CamembertPreprocessor(max_seq_length = 64, column = "designation")),
                               ('model', LogisticRegression(multi_class='multinomial',class_weight= "balanced"))
                           ])

cuda


In [None]:
Simple_text_model.fit(X_train,target)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
